sayou-connector 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
+ from .generator.file_generator import FileGenerator
+ from .generator.requests_generator import RequestsGenerator
+ from .generator.sqlite_generator import SqliteGenerator
+ from .pipeline import ConnectorPipeline
+
+ __all__ = [
+     "ConnectorPipeline",
+     "FileGenerator",
+     "RequestsGenerator",
+     "SqliteGenerator",
+ ]
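
The `__all__` list above defines the package's public API. A minimal import sketch — the import root is an assumption, since the diff shows only relative paths (the distribution itself is named sayou-connector):

    # Hypothetical import root; adjust to wherever the package installs.
    from sayou.connector import (
        ConnectorPipeline,
        FileGenerator,
        RequestsGenerator,
        SqliteGenerator,
    )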
@@ -0,0 +1,38 @@
+ from sayou.core.exceptions import SayouCoreError
+
+
+ class ConnectorError(SayouCoreError):
+     """
+     Base exception for all errors within the sayou-connector toolkit.
+
+     All specific exceptions in this module should inherit from this class
+     to allow catching connector-specific issues globally.
+     """
+
+     pass
+
+
+ class FetcherError(ConnectorError):
+     """
+     Exception raised when a Fetcher fails to retrieve data.
+
+     Examples:
+         - File not found on disk.
+         - HTTP connection timeout or 404/500 errors.
+         - Database connection failure.
+     """
+
+     pass
+
+
+ class GeneratorError(ConnectorError):
+     """
+     Exception raised when a Generator fails to produce tasks.
+
+     Examples:
+         - Invalid start path or configuration.
+         - Failure to parse a sitemap or initial seed page.
+         - Logic errors in the generation strategy.
+     """
+
+     pass
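
Because both FetcherError and GeneratorError inherit from ConnectorError, a single except clause covers any connector failure. A minimal sketch — the absolute import path is an assumption, since the diff only shows relative module paths:

    # Hypothetical import path for the exceptions module shown above.
    from sayou.connector.core.exceptions import ConnectorError, FetcherError

    try:
        raise FetcherError("HTTP 404 while fetching page")
    except ConnectorError as exc:
        # Catches FetcherError and GeneratorError alike.
        print(f"Connector failed: {exc}")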
@@ -0,0 +1,42 @@
+ import os
+
+ from sayou.core.registry import register_component
+ from sayou.core.schemas import SayouTask
+
+ from ..interfaces.base_fetcher import BaseFetcher
+
+
+ @register_component("fetcher")
+ class FileFetcher(BaseFetcher):
+     """
+     Concrete implementation of BaseFetcher for local file systems.
+
+     This fetcher reads binary data directly from the path specified in `task.uri`.
+     It handles basic file I/O operations and raises wrapped exceptions if the file
+     is inaccessible or missing.
+     """
+
+     component_name = "FileFetcher"
+     SUPPORTED_TYPES = ["file"]
+
+     def _do_fetch(self, task: SayouTask) -> bytes:
+         """
+         Read a file from the local file system.
+
+         Args:
+             task (SayouTask): The task containing the file path in `task.uri`.
+
+         Returns:
+             bytes: The raw binary content of the file.
+
+         Raises:
+             FileNotFoundError: If the file does not exist.
+             IOError: If the file cannot be read.
+         """
+         file_path = task.uri
+
+         if not os.path.exists(file_path):
+             raise FileNotFoundError(f"File not found: {file_path}")
+
+         with open(file_path, "rb") as f:
+             return f.read()
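
A usage sketch for FileFetcher, assuming `SayouTask` accepts the same keyword arguments the generators in this diff pass (`source_type`, `uri`, `params`, `meta`) and defaults the rest:

    from sayou.core.schemas import SayouTask  # path taken from the imports above

    task = SayouTask(source_type="file", uri="/tmp/example.txt")  # hypothetical path
    packet = FileFetcher().fetch(task)  # fetch() adds retries, logging, error wrapping
    if packet.success:
        print(packet.data[:64])  # raw bytes returned by _do_fetch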
@@ -0,0 +1,77 @@
+ from urllib.parse import urljoin
+
+ import requests
+
+ try:
+     from bs4 import BeautifulSoup
+ except ImportError:
+     BeautifulSoup = None
+
+ from sayou.core.registry import register_component
+ from sayou.core.schemas import SayouTask
+
+ from ..interfaces.base_fetcher import BaseFetcher
+
+
+ @register_component("fetcher")
+ class RequestsFetcher(BaseFetcher):
+     """
+     Concrete implementation of BaseFetcher for static web pages.
+
+     Retrieves HTML content via HTTP requests. It supports optional CSS selector
+     extraction (via `task.params['selectors']`) and automatically discovers
+     hyperlinks on the page to support the `RequestsGenerator` feedback loop.
+     """
+
+     component_name = "RequestsFetcher"
+     SUPPORTED_TYPES = ["requests"]
+
+     def _do_fetch(self, task: SayouTask) -> dict:
+         """
+         Fetch a web page and extract data/links.
+
+         Args:
+             task (SayouTask): The task containing the URL in `task.uri`.
+                 `task.params` may contain 'selectors'.
+
+         Returns:
+             dict: A dictionary containing extracted text, raw preview,
+                 and found links under '__found_links__'.
+
+         Raises:
+             requests.RequestException: For network-related errors.
+             ImportError: If BeautifulSoup is not installed.
+         """
+         if not BeautifulSoup:
+             raise ImportError("BeautifulSoup4 not installed.")
+
+         headers = {"User-Agent": "Sayou-Connector/0.1.0"}
+         resp = requests.get(task.uri, headers=headers, timeout=10)
+         resp.raise_for_status()
+
+         soup = BeautifulSoup(resp.text, "html.parser")
+         extracted_data = {}
+
+         # 1. Selectors logic
+         selectors = task.params.get("selectors", {})
+         if selectors:
+             for key, sel in selectors.items():
+                 el = soup.select(sel)
+                 if el:
+                     extracted_data[key] = "\n".join(
+                         [e.get_text(strip=True) for e in el]
+                     )
+
+         if not extracted_data:
+             extracted_data["_raw_preview"] = resp.text[:200]
+
+         # 2. Link extraction logic
+         found_links = set()
+         for a in soup.find_all("a", href=True):
+             abs_link = urljoin(task.uri, a["href"])
+             if abs_link.startswith("http"):
+                 found_links.add(abs_link)
+
+         extracted_data["__found_links__"] = list(found_links)
+
+         return extracted_data
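
A selector-extraction sketch for RequestsFetcher (the URL and selector are placeholders; requires the requests and beautifulsoup4 packages):

    task = SayouTask(
        source_type="requests",
        uri="https://example.com",
        params={"selectors": {"title": "h1"}},
    )
    data = RequestsFetcher()._do_fetch(task)
    print(data.get("title"))             # joined text of all <h1> matches, if any
    print(len(data["__found_links__"]))  # absolute links discovered for the crawl loop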
@@ -0,0 +1,51 @@
+ import sqlite3
+ from typing import Any, Dict, List
+
+ from sayou.core.registry import register_component
+ from sayou.core.schemas import SayouTask
+
+ from ..interfaces.base_fetcher import BaseFetcher
+
+
+ @register_component("fetcher")
+ class SqliteFetcher(BaseFetcher):
+     """
+     Concrete implementation of BaseFetcher for SQLite databases.
+
+     Connects to the SQLite database file specified in `task.uri` and executes
+     the SQL query provided in `task.params['query']`. The query runs inside the
+     connection's context manager, and results are returned as a list of dictionaries.
+     """
+
+     component_name = "SqliteFetcher"
+     SUPPORTED_TYPES = ["sqlite"]
+
+     def _do_fetch(self, task: SayouTask) -> List[Dict[str, Any]]:
+         """
+         Execute a SQL query against a SQLite database.
+
+         Args:
+             task (SayouTask): The task containing the DB path in `task.uri`
+                 and the SQL query in `task.params['query']`.
+
+         Returns:
+             List[Dict[str, Any]]: A list of rows, where each row is a dictionary.
+
+         Raises:
+             ValueError: If `task.params['query']` is missing.
+             sqlite3.Error: If the database connection or query execution fails.
+         """
+         db_path = task.uri
+         query = task.params.get("query")
+
+         if not query:
+             raise ValueError("Query param is missing in SayouTask")
+
+         with sqlite3.connect(db_path) as conn:
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             self._log(f"Executing query on {db_path}: {query[:50]}...", level="debug")
+
+             cursor.execute(query)
+             return [dict(row) for row in cursor.fetchall()]
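
A sketch for SqliteFetcher, assuming an `example.db` file exists on disk:

    task = SayouTask(
        source_type="sqlite",
        uri="example.db",  # hypothetical database file
        params={"query": "SELECT name FROM sqlite_master WHERE type='table'"},
    )
    rows = SqliteFetcher()._do_fetch(task)  # e.g. [{"name": "items"}, ...]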
@@ -0,0 +1,128 @@
+ import fnmatch
+ import os
+ from typing import Iterator
+
+ from sayou.core.registry import register_component
+ from sayou.core.schemas import SayouTask
+
+ from ..interfaces.base_generator import BaseGenerator
+
+
+ @register_component("generator")
+ class FileGenerator(BaseGenerator):
+     """
+     Concrete implementation of BaseGenerator for file system traversal.
+
+     Scans a directory tree starting from a source path. It yields `SayouTask`s
+     for files that match specific criteria, such as file extensions or name patterns.
+     Supports both recursive and flat directory scanning.
+     """
+
+     component_name = "FileGenerator"
+     SUPPORTED_TYPES = ["file"]
+
+     @classmethod
+     def can_handle(cls, source: str) -> float:
+         """
+         Evaluates whether this generator can handle the given source.
+
+         Analyzes the source string to determine if it matches the pattern or format
+         supported by this generator. Returns a confidence score between 0.0 and 1.0.
+
+         Args:
+             source (str): The input source string to evaluate.
+
+         Returns:
+             float: A confidence score where 1.0 means full confidence,
+                 0.0 means the source is incompatible, and intermediate values
+                 indicate partial matches or heuristics.
+         """
+         if os.path.exists(source):
+             return 1.0
+         if source.startswith("/") or source.startswith("./") or ":\\" in source:
+             return 0.8
+         return 0.0
+
+     def initialize(
+         self,
+         source: str,
+         recursive: bool = True,
+         extensions: list = None,
+         name_pattern: str = "*",
+         **kwargs
+     ):
+         """
+         Configure the file scanning strategy.
+
+         Args:
+             source (str): The root directory or file path to start scanning.
+             recursive (bool): If True, scan subdirectories recursively.
+             extensions (Optional[List[str]]): List of allowed extensions (e.g., ['.pdf', '.txt']).
+             name_pattern (str): Glob pattern for filename matching (e.g., '*report*').
+             **kwargs: Ignored additional arguments.
+         """
+         self.root_path = os.path.abspath(source)
+         self.recursive = recursive
+         self.extensions = [e.lower() for e in extensions] if extensions else None
+         self.name_pattern = name_pattern
+
+     def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
+         """
+         Walk through the file system and yield tasks for valid files.
+
+         Yields:
+             Iterator[SayouTask]: Tasks with `source_type='file'`.
+         """
+         if os.path.isfile(self.root_path):
+             if self._is_valid(os.path.basename(self.root_path)):
+                 yield self._create_task(self.root_path)
+             return
+
+         if self.recursive:
+             walker = os.walk(self.root_path)
+         else:
+             # os.listdir also returns directories; keep only regular files
+             entries = [
+                 f for f in os.listdir(self.root_path)
+                 if os.path.isfile(os.path.join(self.root_path, f))
+             ]
+             walker = [(self.root_path, [], entries)]
+
+         for root, _, files in walker:
+             for file in files:
+                 full_path = os.path.join(root, file)
+                 if self._is_valid(file):
+                     yield self._create_task(full_path)
+
+     def _is_valid(self, filename: str) -> bool:
+         """
+         Check if a file matches the extension and name pattern criteria.
+
+         Args:
+             filename (str): The name of the file to check.
+
+         Returns:
+             bool: True if the file should be processed, False otherwise.
+         """
+         if not fnmatch.fnmatch(filename, self.name_pattern):
+             return False
+         if (
+             self.extensions
+             and os.path.splitext(filename)[1].lower() not in self.extensions
+         ):
+             return False
+         return True
+
+     def _create_task(self, path: str) -> SayouTask:
+         """
+         Create a SayouTask for a valid file path.
+
+         Args:
+             path (str): The absolute path to the file.
+
+         Returns:
+             SayouTask: The configured task object.
+         """
+         return SayouTask(
+             source_type="file", uri=path, meta={"filename": os.path.basename(path)}
+         )
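
A scan sketch for FileGenerator. The public entry point on BaseGenerator is not part of this diff, so the sketch calls `initialize` and the `_do_generate` hook directly:

    gen = FileGenerator()
    gen.initialize("./docs", recursive=True, extensions=[".pdf", ".txt"])  # hypothetical dir
    for task in gen._do_generate("./docs"):
        print(task.uri, task.meta["filename"])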
@@ -0,0 +1,113 @@
+ import re
+ from collections import deque
+ from typing import Iterator
+
+ from sayou.core.registry import register_component
+ from sayou.core.schemas import SayouPacket, SayouTask
+
+ from ..interfaces.base_generator import BaseGenerator
+
+
+ @register_component("generator")
+ class RequestsGenerator(BaseGenerator):
+     """
+     Concrete implementation of BaseGenerator for web crawling.
+
+     Manages a frontier queue of URLs to visit. It starts from a seed URL and
+     dynamically adds new targets based on links discovered by the Fetcher (Feedback),
+     respecting maximum depth and URL pattern constraints.
+     """
+
+     component_name = "RequestsGenerator"
+     SUPPORTED_TYPES = ["requests"]
+
+     @classmethod
+     def can_handle(cls, source: str) -> float:
+         """
+         Evaluates whether this generator can handle the given source.
+
+         Analyzes the source string to determine if it matches the pattern or format
+         supported by this generator. Returns a confidence score between 0.0 and 1.0.
+
+         Args:
+             source (str): The input source string to evaluate.
+
+         Returns:
+             float: A confidence score where 1.0 means full confidence,
+                 0.0 means the source is incompatible, and intermediate values
+                 indicate partial matches or heuristics.
+         """
+         s = source.strip().lower()
+
+         if s.startswith("http://") or s.startswith("https://"):
+             return 1.0
+
+         if s.startswith("www."):
+             return 0.8
+
+         return 0.0
+
+     def initialize(
+         self,
+         source: str,
+         link_pattern: str = ".*",
+         selectors: dict = None,
+         max_depth: int = 1,
+         **kwargs,
+     ):
+         """
+         Configure the web crawling strategy.
+
+         Args:
+             source (str): The seed URL to start crawling.
+             link_pattern (str): Regex pattern to filter links to follow.
+             selectors (Optional[dict]): CSS selectors to extract specific data from pages.
+             max_depth (int): Maximum depth to traverse from the seed URL.
+             **kwargs: Ignored additional arguments.
+         """
+         self.queue = deque([(source, 0)])
+         self.visited = {source}
+         self.link_regex = re.compile(link_pattern)
+         self.selectors = selectors or {}
+         self.max_depth = max_depth
+
+     def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
+         """
+         Yield tasks from the crawling queue.
+
+         Yields:
+             Iterator[SayouTask]: Tasks for URLs in the queue.
+         """
+         while self.queue:
+             url, depth = self.queue.popleft()
+             yield SayouTask(
+                 source_type="requests",
+                 uri=url,
+                 params={"selectors": self.selectors, "depth": depth},
+             )
+
+     def _do_feedback(self, result: SayouPacket):
+         """
+         Extract links from the fetched page and add them to the queue.
+
+         Args:
+             result (SayouPacket): The result containing extracted links ('__found_links__').
+         """
+         if not result.success or not result.data:
+             return
+
+         current_depth = result.task.params.get("depth", 0)
+         if current_depth >= self.max_depth:
+             return
+
+         links = result.data.get("__found_links__", [])
+         new_links = 0
+
+         for link in links:
+             if link not in self.visited and self.link_regex.search(link):
+                 self.visited.add(link)
+                 self.queue.append((link, current_depth + 1))
+                 new_links += 1
+
+         if new_links > 0:
+             self._log(f"Added {new_links} new links (Depth {current_depth+1})")
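
The feedback loop pairs this generator with RequestsFetcher: each page's `__found_links__` is fed back to extend the frontier queue. A hand-wired sketch (a real run would presumably go through ConnectorPipeline, which is not shown in this diff):

    gen = RequestsGenerator()
    gen.initialize("https://example.com", link_pattern=r"example\.com", max_depth=1)
    fetcher = RequestsFetcher()

    for task in gen._do_generate("https://example.com"):
        packet = fetcher.fetch(task)
        gen._do_feedback(packet)  # queues matching unvisited links at depth + 1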
@@ -0,0 +1,140 @@
+ import os
+ from typing import Iterator
+
+ from sayou.core.registry import register_component
+ from sayou.core.schemas import SayouPacket, SayouTask
+
+ from ..interfaces.base_generator import BaseGenerator
+
+
+ @register_component("generator")
+ class SqliteGenerator(BaseGenerator):
+     """
+     Concrete implementation of BaseGenerator for SQL pagination.
+
+     Generates a sequence of database query tasks using LIMIT and OFFSET strategies.
+     It continues to yield tasks by incrementing the offset until the Fetcher returns
+     an empty result or a partial batch, indicating the end of the dataset.
+     """
+
+     component_name = "SqliteGenerator"
+     SUPPORTED_TYPES = ["sqlite"]
+
+     @classmethod
+     def can_handle(cls, source: str) -> float:
+         """
+         Evaluates whether this generator can handle the given source.
+
+         Analyzes the source string to determine if it matches the pattern or format
+         supported by this generator. Returns a confidence score between 0.0 and 1.0.
+
+         Args:
+             source (str): The input source string to evaluate.
+
+         Returns:
+             float: A confidence score where 1.0 means full confidence,
+                 0.0 means the source is incompatible, and intermediate values
+                 indicate partial matches or heuristics.
+         """
+         s = source.strip()
+
+         if s.lower().startswith("sqlite:///"):
+             return 1.0
+
+         if any(s.lower().endswith(ext) for ext in [".db", ".sqlite", ".sqlite3"]):
+             if os.path.isfile(s):
+                 return 1.0
+             return 0.9
+
+         return 0.0
+
+     def initialize(
+         self,
+         source: str,
+         query: str = None,
+         batch_size: int = 1000,
+         **kwargs,
+     ):
+         """
+         Configure the SQL scanning strategy with pagination.
+
+         Args:
+             source (str): The database connection string or file path.
+             query (str): The base SQL query (without LIMIT/OFFSET).
+             batch_size (int): Number of rows to fetch per task.
+             **kwargs: Ignored additional arguments.
+         """
+         self.conn_str = self._clean_source(source)
+         self.base_query = (
+             query.strip().rstrip(";")
+             if query
+             else "SELECT name FROM sqlite_master WHERE type='table'"
+         )
+         self.batch_size = batch_size
+         self.current_offset = 0
+         self.stop_flag = False
+
+     def _clean_source(self, source: str) -> str:
+         """
+         Extracts the actual file path from a source URI.
+
+         Removes prefixes like 'sqlite:///' or 'sqlite://' from the source string
+         to return a clean file path compatible with the standard `sqlite3` connect method.
+
+         Args:
+             source (str): The input source string (e.g., 'sqlite:///data/test.db').
+
+         Returns:
+             str: The cleaned file path (e.g., 'data/test.db').
+         """
+         s = source.strip()
+         if s.lower().startswith("sqlite:///"):
+             return s[10:]
+         elif s.lower().startswith("sqlite://"):
+             return s[9:]
+         return s
+
+     def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
+         """
+         Yield pagination tasks until the stop flag is set.
+
+         Yields:
+             Iterator[SayouTask]: Tasks with `source_type='sqlite'` and pagination params.
+         """
+         while not self.stop_flag:
+             paginated_query = f"{self.base_query} LIMIT {self.batch_size} OFFSET {self.current_offset}"
+
+             task = SayouTask(
+                 source_type="sqlite",
+                 uri=self.conn_str,
+                 params={"query": paginated_query},
+                 meta={"offset": self.current_offset, "batch": self.batch_size},
+             )
+
+             yield task
+
+             self.current_offset += self.batch_size
+
+     def _do_feedback(self, result: SayouPacket):
+         """
+         Determine if pagination should stop based on the fetch result.
+
+         If the number of fetched rows is less than `batch_size` or if the fetch failed,
+         the generator stops producing tasks.
+
+         Args:
+             result (SayouPacket): The result from the Fetcher.
+         """
+         # 1. Stop if the fetch failed or returned no data.
+         if not result.success or not result.data:
+             self._log("No data returned or fetch failed. Stopping.", level="warning")
+             self.stop_flag = True
+             return
+
+         # 2. A batch smaller than batch_size means this was the last page.
+         rows = result.data
+         if isinstance(rows, list) and len(rows) < self.batch_size:
+             self._log(
+                 f"Reached end of records (Fetched {len(rows)} < Batch {self.batch_size})."
+             )
+             self.stop_flag = True
1
+ from abc import abstractmethod
2
+ from typing import Any
3
+
4
+ from sayou.core.base_component import BaseComponent
5
+ from sayou.core.decorators import measure_time, retry
6
+ from sayou.core.schemas import SayouPacket, SayouTask
7
+
8
+ from ..core.exceptions import FetcherError
9
+
10
+
11
+ class BaseFetcher(BaseComponent):
12
+ """
13
+ (Tier 1) Abstract base class for all data fetchers.
14
+
15
+ This class implements the Template Method pattern. It handles common logic
16
+ like logging, error wrapping, and retries in `fetch()`, while delegating
17
+ the actual retrieval logic to `_do_fetch()`.
18
+ """
19
+
20
+ component_name = "BaseFetcher"
21
+ SUPPORTED_TYPES = []
22
+
23
+ @classmethod
24
+ def can_handle(cls, uri: str) -> float:
25
+ """
26
+ Evaluates whether this fetcher can handle the specific Task URI.
27
+ """
28
+ return 0.0
29
+
30
+ @measure_time
31
+ @retry(max_retries=3, delay=1.0)
32
+ def fetch(self, task: SayouTask) -> SayouPacket:
33
+ """
34
+ Execute the fetching process for a given task.
35
+
36
+ This method wraps the actual fetching logic with error handling and logging.
37
+ It guarantees to return a SayouPacket, even if the operation fails.
38
+
39
+ Args:
40
+ task (SayouTask): The task definition containing the URI and parameters.
41
+
42
+ Returns:
43
+ SayouPacket: A packet containing the fetched data (on success)
44
+ or error details (on failure).
45
+ """
46
+ self._emit("on_start", input_data=task)
47
+ self._log(f"Fetching: {task.uri} ({task.source_type})", level="debug")
48
+
49
+ try:
50
+ data = self._do_fetch(task)
51
+ packet = SayouPacket(task=task, data=data, success=True)
52
+ self._emit("on_finish", result_data=packet, success=True)
53
+ return packet
54
+
55
+ except Exception as e:
56
+ self._emit("on_error", error=e)
57
+ wrapped_error = FetcherError(
58
+ f"[{self.component_name}] Failed to fetch: {str(e)}"
59
+ )
60
+ self.logger.error(wrapped_error, exc_info=True)
61
+
62
+ return SayouPacket(
63
+ task=task, data=None, success=False, error=str(wrapped_error)
64
+ )
65
+
66
+ @abstractmethod
67
+ def _do_fetch(self, task: SayouTask) -> Any:
68
+ """
69
+ [Abstract Hook] Implement the actual data retrieval logic here.
70
+
71
+ Args:
72
+ task (SayouTask): The task containing source URI and params.
73
+
74
+ Returns:
75
+ Any: The raw data retrieved (e.g., bytes, str, list).
76
+
77
+ Raises:
78
+ Exception: Raise any exception if retrieval fails.
79
+ The parent `fetch` method will catch and wrap it.
80
+ """
81
+ raise NotImplementedError
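
Under the Template Method split, a new fetcher only implements `_do_fetch`; `fetch()` contributes retries, timing, events, and error wrapping. A minimal hypothetical subclass, assuming the same registry decorator the concrete fetchers above use:

    @register_component("fetcher")
    class EchoFetcher(BaseFetcher):
        """Hypothetical fetcher that echoes the task URI back as bytes."""

        component_name = "EchoFetcher"
        SUPPORTED_TYPES = ["echo"]

        def _do_fetch(self, task: SayouTask) -> bytes:
            return task.uri.encode("utf-8")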