PyPI - scrapegoat-core - Versions diffs - 1.2.0__py3-none-any.whl - Mend

scrapegoat-core 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

scrapegoat_core/__init__.py +3 -0
scrapegoat_core/classes/__init__.py +13 -0
scrapegoat_core/classes/block.py +50 -0
scrapegoat_core/classes/command.py +205 -0
scrapegoat_core/classes/conditions.py +87 -0
scrapegoat_core/classes/gardener.py +145 -0
scrapegoat_core/classes/goat.py +28 -0
scrapegoat_core/classes/interpreter.py +457 -0
scrapegoat_core/classes/milkmaid.py +27 -0
scrapegoat_core/classes/milkman.py +32 -0
scrapegoat_core/classes/node.py +247 -0
scrapegoat_core/classes/sheepdog.py +81 -0
scrapegoat_core/classes/shepherd.py +108 -0
scrapegoat_core/cli.py +38 -0
scrapegoat_core/exceptions/__init__.py +0 -0
scrapegoat_core/main.py +0 -0
scrapegoat_core-1.2.0.dist-info/METADATA +22 -0
scrapegoat_core-1.2.0.dist-info/RECORD +22 -0
scrapegoat_core-1.2.0.dist-info/WHEEL +5 -0
scrapegoat_core-1.2.0.dist-info/entry_points.txt +2 -0
scrapegoat_core-1.2.0.dist-info/licenses/LICENSE +21 -0
scrapegoat_core-1.2.0.dist-info/top_level.txt +1 -0

scrapegoat_core/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .classes import Goat, HTMLNode, Condition, InCondition, IfCondition, Gardener, Interpeter, Command, Shepherd, Sheepdog, TokenType, Token, Tokenizer, Parser, ConditionParser, ScrapeSelectParser, ExtractParser, Milkmaid, Milkman, ChurnCommand, GrazeCommand, DeliverCommand, FetchCommand, VisitParser, FlagParser, GoatspeakBlock, Query, HeadlessSheepdog
+__all__ = ["Goat", "HTMLNode", "Condition", "InCondition", "IfCondition", "Gardener", "Interpeter", "Command", "Shepherd", "Sheepdog", "TokenType", "Token", "Tokenizer", "Parser", "ConditionParser", "ScrapeSelectParser", "ExtractParser", "Milkmaid", "Milkman", "ChurnCommand", "GrazeCommand", "DeliverCommand", "FetchCommand", "VisitParser", "FlagParser", "GoatspeakBlock", "Query", "HeadlessSheepdog"]

scrapegoat_core/classes/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+from .goat import Goat
+from .milkmaid import Milkmaid
+from .milkman import Milkman
+from .node import HTMLNode
+from .conditions import Condition, InCondition, IfCondition
+from .gardener import Gardener
+from .interpreter import Interpeter, TokenType, Token, Tokenizer, Parser, ConditionParser, ScrapeSelectParser, ExtractParser, VisitParser, FlagParser
+from .command import Command, GrazeCommand, ChurnCommand, DeliverCommand, FetchCommand
+from .shepherd import Shepherd
+from .sheepdog import Sheepdog, HeadlessSheepdog
+from .block import GoatspeakBlock, Query
+__all__ = ["Goat", "HTMLNode", "Condition", "InCondition", "IfCondition", "Gardener", "Interpeter", "Command", "GrazeCommand", "ChurnCommand", "DeliverCommand", "Shepherd", "Sheepdog", "Loom", "TokenType", "Token", "Tokenizer", "Parser", "ConditionParser", "ScrapeSelectParser", "ExtractParser", "Milkmaid", "Milkman", "FetchCommand", "VisitParser", "FlagParser", "GoatspeakBlock", "Query", "HeadlessSheepdog"]

scrapegoat_core/classes/block.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""
+"""
+class GoatspeakBlock:
+    """
+    """
+    def __init__(self, fetch_command, query_list):
+        """
+        """
+        self.fetch_command = fetch_command
+        self.query_list = query_list
+    def __repr__(self):
+        """
+        """
+        return f"GoatspeakBlock(fetch_command={self.fetch_command}, query_list={self.query_list})"
+    def to_goat_file(self) -> None:
+        """
+        """
+        pass
+class Query:
+    """
+    """
+    def __init__(self, graze_commands, fetch_command=None, churn_command=None, deliver_command=None):
+        """
+        """
+        self.fetch_command = fetch_command
+        self.graze_commands = graze_commands
+        self.churn_command = churn_command
+        self.deliver_command = deliver_command
+    def __repr__(self):
+        """
+        """
+        return f"Query(graze_commands={self.graze_commands}, fetch_command={self.fetch_command}, churn_command={self.churn_command}, deliver_command={self.deliver_command})"
+def main():
+    """
+    """
+    pass
+if __name__ == "__main__":
+    """
+    """
+    main()

scrapegoat_core/classes/command.py ADDED Viewed

@@ -0,0 +1,205 @@
+"""
+"""
+from abc import ABC, abstractmethod
+import os
+import json
+import csv
+import requests
+from .conditions import InCondition
+class Command(ABC):
+    """
+    """
+    @abstractmethod
+    def __init__(self, action: str):
+        """
+        """
+        self.action = action
+    @abstractmethod
+    def execute(self, root) -> any:
+        """
+        """
+        pass
+class GrazeCommand(Command):
+    """
+    """
+    def __init__(self, action: str, count: int, element: str, conditions: list=None, flags: list=None):
+        """
+        """
+        super().__init__(action=action)
+        self.count = count
+        self.element = element
+        self.conditions = conditions or []
+        self.flags = flags or []
+        for cond in self.conditions:
+            if isinstance(cond, InCondition) and cond.target == "POSITION" and cond.query_tag is None:
+                cond.query_tag = self.element
+    def _evaluate(self, node, root) -> bool:
+        """
+        """
+        if node.tag_type != self.element:
+            return False
+        return all(cond.evaluate(node, root) for cond in self.conditions)
+    def execute(self, root) -> list:
+        """
+        """
+        results = []
+        for node in root.preorder_traversal():
+            if self._evaluate(node, root):
+                results.append(node)
+                if self.count > 0 and len(results) >= self.count:
+                    break
+        return results
+class ChurnCommand(Command):
+    """
+    """
+    def __init__(self, fields: list = None, ignore_children: bool = False, ignore_grandchildren: bool = False):
+        """
+        """
+        super().__init__(action="extract")
+        self.fields = fields
+        self.ignore_children = ignore_children
+        self.ignore_grandchildren = ignore_grandchildren
+    def execute(self, node) -> None:
+        """
+        """
+        node.set_extract_instructions(self.fields, self.ignore_children, self.ignore_grandchildren)
+class DeliverCommand(Command):
+    """
+    """
+    VALID_TYPES = {"csv", "json"}
+    def __init__(self, file_type: str, filepath: str = None, filename: str = None):
+        """
+        """
+        super().__init__(action="output")
+        self.file_type = file_type
+        self.filepath = filepath or os.getcwd()
+        base, ext = os.path.splitext(filename or f"output.{file_type}")
+        self.filename = base + (ext if ext else f".{file_type}")
+        self.full_path = os.path.join(self.filepath, self.filename)
+    def execute(self, nodes: list) -> str:
+        """
+        """
+        os.makedirs(self.filepath, exist_ok=True)
+        if self.file_type.lower() == "csv":
+            self._to_csv(nodes)
+        elif self.file_type.lower() == "json":
+            self._to_json(nodes)
+        return self.full_path
+    def _flatten_dict(self, d: dict, parent_key: str = '', sep: str = '.') -> dict:
+        """
+        """
+        items = {}
+        for k, v in d.items():
+            new_key = f"{k}" if parent_key else k
+            if isinstance(v, dict):
+                items.update(self._flatten_dict(v, new_key, sep=sep))
+            else:
+                items[new_key] = v
+        return items
+    def _collect_nodes(self, node_dict: dict, all_nodes: list) -> dict:
+        """
+        """
+        node_copy = node_dict.copy()
+        had_children = "children" in node_copy
+        children = node_copy.pop("children", [])
+        child_ids = []
+        for child in children:
+            child_flat = self._collect_nodes(child, all_nodes)
+            child_ids.append(child_flat.get("id"))
+        flattened = self._flatten_dict(node_copy)
+        if had_children:
+            if child_ids == [] or all(cid is None for cid in child_ids):
+                flattened["children"] = None
+            else:
+                flattened["children"] = child_ids
+        all_nodes.append(flattened)
+        return node_copy
+    def _to_csv(self, nodes: list) -> None:
+        """
+        """
+        all_nodes = []
+        for node in nodes:
+            node_dict = node.to_dict()
+            self._collect_nodes(node_dict, all_nodes)
+        fieldnames = set()
+        for nd in all_nodes:
+            fieldnames.update(nd.keys())
+        fieldnames = list(fieldnames)
+        os.makedirs(self.filepath, exist_ok=True)
+        with open(self.full_path, mode='w', newline='', encoding='utf-8') as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            for nd in all_nodes:
+                writer.writerow(nd)
+    def _to_json(self, nodes: list) -> None:
+        """
+        """
+        nodes_as_dicts = [node.to_dict() for node in nodes]
+        with open(self.full_path, mode='w', encoding='utf-8') as jsonfile:
+            json.dump(nodes_as_dicts, jsonfile, indent=4)
+class FetchCommand(Command):
+    """
+    """
+    def __init__(self, url: str, **kwargs):
+        """
+        """
+        super().__init__(action="visit")
+        self.getter = requests.get
+        self.url = url
+        self.kwargs = kwargs
+    def execute(self) -> str:
+        """
+        """
+        return self.getter(self.url, **self.kwargs)
+    def set_getter(self, getter):
+        """
+        """
+        self.getter = getter
+    def __eq__(self, other):
+        """
+        """
+        return isinstance(other, FetchCommand) and self.url == other.url
+def main():
+    """
+    """
+    pass
+if __name__ == "__main__":
+    main()

scrapegoat_core/classes/conditions.py ADDED Viewed

@@ -0,0 +1,87 @@
+"""
+"""
+from abc import ABC, abstractmethod
+class Condition(ABC):
+    """
+    """
+    def __init__(self, negated: bool = False):
+        """
+        """
+        self.negated = negated
+    @abstractmethod
+    def matches(self, node, root) -> bool:
+        """
+        """
+        pass
+    def evaluate(self, node, root) -> bool:
+        """
+        """
+        result = self.matches(node, root)
+        return not result if self.negated else result
+class IfCondition(Condition):
+    """
+    """
+    def __init__(self, key: str, value: str, negated: bool = False, query_tag: str = None):
+        """
+        """
+        super().__init__(negated)
+        self.key = key
+        self.value = value
+        self.query_tag = query_tag
+    def matches(self, node, _) -> bool:
+        """
+        """
+        if self.query_tag is None:
+            raise ValueError("query_tag is required for IF condition")
+        if self.key[0] == "@":
+            return node.has_html_attribute(self.key, self.value) and node.tag_type == self.query_tag
+        else:
+            return node.has_attribute(self.key, self.value) and node.tag_type == self.query_tag
+    def __str__(self):
+        """
+        """
+        return f"IfCondition(key={self.key}, value={self.value}, negated={self.negated}, query_tag={self.query_tag})"
+class InCondition(Condition):
+    """
+    """
+    def __init__(self, target: str, value=None, negated: bool = False, query_tag: str = None):
+        """
+        """
+        super().__init__(negated)
+        self.target = target
+        self.value = value
+        self.query_tag = query_tag
+    def matches(self, node, root) -> bool:
+        """
+        """
+        if self.target == "POSITION":
+            if not root:
+                raise ValueError("Root node is required for POSITION condition")
+            if not self.query_tag:
+                raise ValueError("query_tag is required for POSITION condition")
+            position = 1
+            for n in root.preorder_traversal():
+                if n.tag_type == self.query_tag:
+                    if node == n:
+                        return position == self.value
+                    position += 1
+            return False
+        else:
+            return node.is_descendant_of(self.target)
+    def __str__(self):
+        """
+        """
+        return f"InCondition(target={self.target}, value={self.value}, negated={self.negated}, query_tag={self.query_tag})"

scrapegoat_core/classes/gardener.py ADDED Viewed

@@ -0,0 +1,145 @@
+"""
+"""
+# IMPORTS
+from html.parser import HTMLParser
+from .node import HTMLNode
+class Gardener(HTMLParser):
+    """
+    """
+    VOID_TAGS = {"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"}
+    AUTO_CLOSE = {
+        "li": {"li"},
+        "p": {"address", "article", "aside", "blockquote", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "main", "nav", "ol", "p", "pre", "section", "table", "ul"},
+        "dt": {"dt", "dd"},
+        "dd": {"dt", "dd"},
+        "tr": {"tr"},
+        "td": {"td", "th"},
+        "th": {"td", "th"}
+    }
+    INLINE_TAGS = {"b", "i", "strong", "em", "u", "small", "mark", "sub", "sup", "a", "span", "img", "br", "code", "s", "q", "cite"}
+    def __init__(self):
+        """
+        """
+        super().__init__()
+        self.tag_counts = {}
+        self.root = None
+        self.stack = []
+    def _auto_close_before(self, new_tag: str):
+        """
+        """
+        while self.stack:
+            current_node = next((n for n in reversed(self.stack) if n is not None), None)
+            if current_node is None:
+                break
+            current_tag = current_node.tag_type
+            if current_tag in self.AUTO_CLOSE and new_tag in self.AUTO_CLOSE[current_tag]:
+                while self.stack:
+                    popped = self.stack.pop()
+                    if popped is current_node:
+                        break
+            else:
+                break
+    def handle_starttag(self, tag_type, html_attributes):
+        """
+        """
+        self._auto_close_before(tag_type)
+        node = HTMLNode(raw=self.get_starttag_text(), tag_type=tag_type, html_attributes=dict(html_attributes))
+        node.is_inline = tag_type in self.INLINE_TAGS
+        self.tag_counts[tag_type] = self.tag_counts.get(tag_type, 0) + 1
+        node.set_retrieval_instructions(f"SCRAPE 1 {tag_type} IN POSITION={self.tag_counts[tag_type]};")
+        if self.root is None:
+            self.root = node
+            if tag_type not in self.VOID_TAGS:
+                self.stack.append(node)
+            return
+        parent = next((n for n in reversed(self.stack) if n is not None), self.root)
+        parent.children.append(node)
+        node.parent = parent
+        if tag_type not in self.VOID_TAGS:
+            self.stack.append(node)
+    def handle_endtag(self, tag_type):
+        """
+        """
+        for i in range(len(self.stack)-1, -1, -1):
+            if self.stack[i].tag_type == tag_type:
+                del self.stack[i:]
+                break
+        return
+    def handle_data(self, data):
+        """
+        """
+        stripped = data.strip()
+        if not stripped:
+            return
+        current = next((n for n in reversed(self.stack) if n is not None), self.root)
+        # Add text to current node
+        if current.body:
+            current.body += " " + stripped
+        else:
+            current.body = stripped
+        current.has_data = True
+        # Bubble text up if inline
+        if getattr(current, "is_inline", False) and current.parent is not None:
+            if current.parent.body:
+                current.parent.body += " " + stripped
+            else:
+                current.parent.body = stripped
+            current.parent.has_data = True
+    def _append_root_tag(self, raw_html: str) -> str:
+        """
+        """
+        html_lower = raw_html.lower()
+        if "<html" not in html_lower:
+            raw_html = f"<html>{raw_html}</html>"
+        if "<body" not in html_lower:
+            raw_html = raw_html.replace("<html>", "<html><body>", 1)
+            raw_html = raw_html.replace("</html>", "</body></html>", 1)
+        return raw_html
+    def grow_tree(self, raw_html: str) -> None:
+        """
+        """
+        self.root = None
+        self.stack = []
+        self.tag_counts = {}
+        self.reset()
+        wrapped_html = self._append_root_tag(raw_html)
+        self.feed(wrapped_html)
+        return self.root
+    def get_root(self) -> HTMLNode:
+        """
+        """
+        return self.root
+def main():
+    """
+    """
+    pass
+if __name__ == "__main__":
+    main()

scrapegoat_core/classes/goat.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""
+"""
+class Goat:
+    """
+    """
+    def __init__(self):
+        """
+        """
+        pass
+    def feast(self, root, graze_commands) -> list:
+        """
+        """
+        results = []
+        i = 0
+        while i < len(graze_commands):
+            graze_command = graze_commands[i]
+            if graze_command.action.lower() == "select":
+                rebased_roots = graze_command.execute(root)
+                graze_command_subset = graze_commands[i + 1:]
+                for new_root in rebased_roots:
+                    results.extend(self.feast(new_root, graze_command_subset))
+                return results
+            else:
+                results.extend(graze_command.execute(root))
+            i += 1
+        return results