sayou-connector 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sayou/connector/__init__.py +11 -0
- sayou/connector/core/exceptions.py +38 -0
- sayou/connector/fetcher/file_fetcher.py +42 -0
- sayou/connector/fetcher/requests_fetcher.py +77 -0
- sayou/connector/fetcher/sqlite_fetcher.py +50 -0
- sayou/connector/generator/file_generator.py +124 -0
- sayou/connector/generator/requests_generator.py +113 -0
- sayou/connector/generator/sqlite_generator.py +140 -0
- sayou/connector/interfaces/base_fetcher.py +81 -0
- sayou/connector/interfaces/base_generator.py +99 -0
- sayou/connector/pipeline.py +304 -0
- sayou/connector/plugins/gmail_fetcher.py +127 -0
- sayou/connector/plugins/gmail_generator.py +79 -0
- sayou/connector/plugins/google_calendar_fetcher.py +89 -0
- sayou/connector/plugins/google_calendar_generator.py +46 -0
- sayou/connector/plugins/google_drive_fetcher.py +151 -0
- sayou/connector/plugins/google_drive_generator.py +107 -0
- sayou/connector/plugins/imap_email_fetcher.py +140 -0
- sayou/connector/plugins/imap_email_generator.py +93 -0
- sayou/connector/plugins/notion_fetcher.py +301 -0
- sayou/connector/plugins/notion_generator.py +73 -0
- sayou/connector/plugins/public_youtube_fetcher.py +134 -0
- sayou/connector/plugins/public_youtube_generator.py +60 -0
- sayou_connector-0.3.12.dist-info/METADATA +303 -0
- sayou_connector-0.3.12.dist-info/RECORD +26 -0
- sayou_connector-0.3.12.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from .generator.file_generator import FileGenerator
|
|
2
|
+
from .generator.requests_generator import RequestsGenerator
|
|
3
|
+
from .generator.sqlite_generator import SqliteGenerator
|
|
4
|
+
from .pipeline import ConnectorPipeline
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"ConnectorPipeline",
|
|
8
|
+
"FileGenerator",
|
|
9
|
+
"RequestsGenerator",
|
|
10
|
+
"SqliteGenerator",
|
|
11
|
+
]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from sayou.core.exceptions import SayouCoreError
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ConnectorError(SayouCoreError):
    """
    Root of the sayou-connector exception hierarchy.

    Connector-specific exceptions derive from this class, so a single
    `except ConnectorError` clause is enough to catch any of them.
    """
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FetcherError(ConnectorError):
    """
    Signals that a Fetcher could not retrieve its data.

    Typical causes:
        - The requested file is missing on disk.
        - An HTTP request timed out or answered with a 404/500 status.
        - A database connection could not be established.
    """
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class GeneratorError(ConnectorError):
    """
    Signals that a Generator could not produce tasks.

    Typical causes:
        - The start path or configuration is invalid.
        - A sitemap or the initial seed page could not be parsed.
        - The generation strategy itself contains a logic error.
    """
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from sayou.core.registry import register_component
|
|
4
|
+
from sayou.core.schemas import SayouTask
|
|
5
|
+
|
|
6
|
+
from ..interfaces.base_fetcher import BaseFetcher
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_component("fetcher")
class FileFetcher(BaseFetcher):
    """
    Fetcher for files on the local file system.

    Treats `task.uri` as a path and returns the file's raw bytes. Errors are
    raised as-is from `_do_fetch`; wrapping them is left to the caller.
    """

    component_name = "FileFetcher"
    SUPPORTED_TYPES = ["file"]

    def _do_fetch(self, task: SayouTask) -> bytes:
        """
        Load the complete content of the file named by `task.uri`.

        Args:
            task (SayouTask): Task whose `uri` is the path of the file to read.

        Returns:
            bytes: Everything read from the file, opened in binary mode.

        Raises:
            FileNotFoundError: If nothing exists at the given path.
            IOError: If the path exists but cannot be opened or read.
        """
        target = task.uri

        if os.path.exists(target):
            # Binary mode: the bytes are handed on untouched, no decoding here.
            with open(target, "rb") as handle:
                payload = handle.read()
            return payload

        raise FileNotFoundError(f"File not found: {target}")
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from urllib.parse import urljoin
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from bs4 import BeautifulSoup
|
|
7
|
+
except ImportError:
|
|
8
|
+
BeautifulSoup = None
|
|
9
|
+
|
|
10
|
+
from sayou.core.registry import register_component
|
|
11
|
+
from sayou.core.schemas import SayouTask
|
|
12
|
+
|
|
13
|
+
from ..interfaces.base_fetcher import BaseFetcher
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@register_component("fetcher")
class RequestsFetcher(BaseFetcher):
    """
    Fetcher for static web pages retrieved over HTTP.

    Downloads the page at `task.uri`, optionally extracts text through the CSS
    selectors given in `task.params['selectors']`, and collects every absolute
    http(s) hyperlink on the page under the '__found_links__' key so that a
    crawling generator can feed them back into its queue.
    """

    component_name = "RequestsFetcher"
    SUPPORTED_TYPES = ["requests"]

    def _do_fetch(self, task: SayouTask) -> dict:
        """
        Download one page and return its extracted fields and links.

        Args:
            task (SayouTask): Task whose `uri` is the page URL. `task.params`
                may carry 'selectors', a mapping of output key -> CSS selector.

        Returns:
            dict: One entry per selector that matched (texts of all matching
                elements joined with newlines). When nothing was extracted, a
                '_raw_preview' entry with the first 200 characters of the
                response text is added instead. '__found_links__' is always
                present and holds the de-duplicated list of discovered links.

        Raises:
            requests.RequestException: For network errors or error statuses.
            ImportError: If BeautifulSoup is not installed.
        """
        if not BeautifulSoup:
            raise ImportError("BeautifulSoup4 not installed.")

        response = requests.get(
            task.uri,
            headers={"User-Agent": "Sayou-Connector/0.1.0"},
            timeout=10,
        )
        response.raise_for_status()

        document = BeautifulSoup(response.text, "html.parser")
        payload = {}

        # 1. Selector-driven extraction (skipped when no selectors are given).
        requested = task.params.get("selectors", {})
        for field, css in (requested or {}).items():
            matches = document.select(css)
            if matches:
                payload[field] = "\n".join(
                    node.get_text(strip=True) for node in matches
                )

        # Fall back to a short raw preview so the packet is never empty.
        if not payload:
            payload["_raw_preview"] = response.text[:200]

        # 2. Link discovery: resolve each href against the page URL and keep
        #    only http/https results; the set removes duplicates.
        discovered = {
            absolute
            for absolute in (
                urljoin(task.uri, anchor["href"])
                for anchor in document.find_all("a", href=True)
            )
            if absolute.startswith("http")
        }
        payload["__found_links__"] = list(discovered)

        return payload
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
|
|
4
|
+
from sayou.core.registry import register_component
|
|
5
|
+
from sayou.core.schemas import SayouTask
|
|
6
|
+
|
|
7
|
+
from ..interfaces.base_fetcher import BaseFetcher
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_component("fetcher")
class SqliteFetcher(BaseFetcher):
    """
    Concrete implementation of BaseFetcher for SQLite databases.

    Connects to the SQLite database file specified in `task.uri` and executes
    the SQL query provided in `task.params['query']`. The connection is opened
    per fetch and always closed before returning; results are returned as a
    list of dictionaries.
    """

    component_name = "SqliteFetcher"
    SUPPORTED_TYPES = ["sqlite"]

    def _do_fetch(self, task: SayouTask) -> List[Dict[str, Any]]:
        """
        Execute a SQL query against a SQLite database.

        Args:
            task (SayouTask): The task containing the DB path in `task.uri`
                and the SQL query in `task.params['query']`.

        Returns:
            List[Dict[str, Any]]: A list of rows, where each row is a dictionary
                keyed by column name.

        Raises:
            ValueError: If `task.params['query']` is missing or empty.
            sqlite3.Error: If the database connection or query execution fails.
        """
        db_path = task.uri
        query = task.params.get("query")

        if not query:
            raise ValueError("Query param is missing in SayouTask")

        conn = sqlite3.connect(db_path)
        try:
            # `with conn` only scopes the transaction (commit on success,
            # rollback on error); it does NOT close the connection, hence the
            # explicit close() in `finally` to avoid leaking one per fetch.
            with conn:
                # sqlite3.Row lets each row be converted to a dict by column name.
                conn.row_factory = sqlite3.Row
                cursor = conn.cursor()

                self._log(
                    f"Executing query on {db_path}: {query[:50]}...", level="debug"
                )

                cursor.execute(query)
                return [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
import os
|
|
3
|
+
from typing import Iterator
|
|
4
|
+
|
|
5
|
+
from sayou.core.registry import register_component
|
|
6
|
+
from sayou.core.schemas import SayouTask
|
|
7
|
+
|
|
8
|
+
from ..interfaces.base_generator import BaseGenerator
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@register_component("generator")
class FileGenerator(BaseGenerator):
    """
    Concrete implementation of BaseGenerator for file system traversal.

    Scans a directory tree starting from a source path. It yields `SayouTask`s
    for files that match specific criteria, such as file extensions or name patterns.
    Supports both recursive and flat directory scanning, as well as a source
    that points directly at a single file.
    """

    component_name = "FileGenerator"
    SUPPORTED_TYPES = ["file"]

    @classmethod
    def can_handle(cls, source: str) -> float:
        """
        Score how likely `source` is a local file system path.

        Args:
            source (str): The input source string to evaluate.

        Returns:
            float: 1.0 if the path exists on disk, 0.8 if it merely looks like
                a path (starts with '/' or './', or contains a drive-style
                ':\\'), and 0.0 otherwise.
        """
        if os.path.exists(source):
            return 1.0
        if source.startswith(("/", "./")) or ":\\" in source:
            return 0.8
        return 0.0

    def initialize(
        self,
        source: str,
        recursive: bool = True,
        extensions: list = None,
        name_pattern: str = "*",
        **kwargs
    ):
        """
        Configure the file scanning strategy.

        Args:
            source (str): The root directory or file path to start scanning.
            recursive (bool): If True, scan subdirectories recursively.
            extensions (Optional[List[str]]): List of allowed extensions including
                the leading dot (e.g., ['.pdf', '.txt']); compared case-insensitively.
                None or an empty list disables extension filtering.
            name_pattern (str): Glob pattern for filename matching (e.g., '*report*').
            **kwargs: Ignored additional arguments.
        """
        self.root_path = os.path.abspath(source)
        self.recursive = recursive
        self.extensions = [e.lower() for e in extensions] if extensions else None
        self.name_pattern = name_pattern

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Walk through the file system and yield tasks for valid files.

        The scan always starts from `self.root_path` as set by `initialize`;
        the `source` argument is not consulted here.

        Yields:
            Iterator[SayouTask]: Tasks with `source_type='file'`.
        """
        if os.path.isfile(self.root_path):
            # Match on the basename, exactly as the directory scan below does;
            # matching the absolute path would make patterns such as
            # 'report*' fail for a directly specified file.
            if self._is_valid(os.path.basename(self.root_path)):
                yield self._create_task(self.root_path)
            return

        if self.recursive:
            walker = os.walk(self.root_path)
        else:
            # os.listdir returns sub-directories too; drop them so the flat
            # scan only yields what os.walk would classify as files.
            flat_files = [
                entry
                for entry in os.listdir(self.root_path)
                if not os.path.isdir(os.path.join(self.root_path, entry))
            ]
            walker = [(self.root_path, [], flat_files)]

        for root, _, files in walker:
            for file in files:
                if self._is_valid(file):
                    yield self._create_task(os.path.join(root, file))

    def _is_valid(self, filename: str) -> bool:
        """
        Check if a file matches the extension and name pattern criteria.

        Args:
            filename (str): The name of the file to check (basename, not a path).

        Returns:
            bool: True if the file should be processed, False otherwise.
        """
        if not fnmatch.fnmatch(filename, self.name_pattern):
            return False
        if (
            self.extensions
            and os.path.splitext(filename)[1].lower() not in self.extensions
        ):
            return False
        return True

    def _create_task(self, path: str) -> SayouTask:
        """
        Create a SayouTask for a valid file path.

        Args:
            path (str): The absolute path to the file.

        Returns:
            SayouTask: Task with `source_type='file'`, `uri` set to the path and
                the basename stored under `meta['filename']`.
        """
        return SayouTask(
            source_type="file", uri=path, meta={"filename": os.path.basename(path)}
        )
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from collections import deque
|
|
3
|
+
from typing import Iterator
|
|
4
|
+
|
|
5
|
+
from sayou.core.registry import register_component
|
|
6
|
+
from sayou.core.schemas import SayouPacket, SayouTask
|
|
7
|
+
|
|
8
|
+
from ..interfaces.base_generator import BaseGenerator
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@register_component("generator")
class RequestsGenerator(BaseGenerator):
    """
    Generator that drives a breadth-first web crawl.

    Keeps a FIFO queue of (url, depth) pairs seeded with the start URL. Links
    reported back through `_do_feedback` are appended to the queue when they
    are new, match the configured regex and stay within the maximum depth.
    """

    component_name = "RequestsGenerator"
    SUPPORTED_TYPES = ["requests"]

    @classmethod
    def can_handle(cls, source: str) -> float:
        """
        Score how likely `source` is a web URL.

        Args:
            source (str): The input source string to evaluate.

        Returns:
            float: 1.0 for strings starting with 'http://' or 'https://',
                0.8 for strings starting with 'www.', 0.0 otherwise. The
                check ignores surrounding whitespace and letter case.
        """
        normalized = source.strip().lower()

        if normalized.startswith(("http://", "https://")):
            return 1.0

        # NOTE(review): a bare 'www.' source is queued unchanged by
        # initialize(); confirm the fetcher can handle a scheme-less URL.
        return 0.8 if normalized.startswith("www.") else 0.0

    def initialize(
        self,
        source: str,
        link_pattern: str = ".*",
        selectors: dict = None,
        max_depth: int = 1,
        **kwargs,
    ):
        """
        Set up the crawl state.

        Args:
            source (str): Seed URL; queued at depth 0 and marked as visited.
            link_pattern (str): Regex a discovered link must match (via
                `search`) to be followed.
            selectors (Optional[dict]): CSS selectors forwarded to every task.
            max_depth (int): Pages at this depth no longer contribute links.
            **kwargs: Ignored additional arguments.
        """
        seed = (source, 0)
        self.queue = deque([seed])
        self.visited = {source}
        self.link_regex = re.compile(link_pattern)
        self.selectors = selectors or {}
        self.max_depth = max_depth

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Drain the crawl queue, yielding one task per queued URL.

        The queue may grow between yields when `_do_feedback` is called, so
        the loop re-checks it after every yielded task.

        Yields:
            Iterator[SayouTask]: Tasks with `source_type='requests'` whose
                params carry the selectors and the URL's crawl depth.
        """
        while self.queue:
            target, level = self.queue.popleft()
            task_params = {"selectors": self.selectors, "depth": level}
            yield SayouTask(source_type="requests", uri=target, params=task_params)

    def _do_feedback(self, result: SayouPacket):
        """
        Queue the links found on a fetched page.

        Args:
            result (SayouPacket): Fetch result; links are read from
                `result.data['__found_links__']` and the page depth from
                `result.task.params['depth']`.
        """
        if not (result.success and result.data):
            return

        page_depth = result.task.params.get("depth", 0)
        if page_depth >= self.max_depth:
            # Pages at the depth limit are still fetched but not expanded.
            return

        child_depth = page_depth + 1
        added = 0

        for candidate in result.data.get("__found_links__", []):
            if candidate in self.visited or not self.link_regex.search(candidate):
                continue
            # Mark as visited at enqueue time so a link is never queued twice.
            self.visited.add(candidate)
            self.queue.append((candidate, child_depth))
            added += 1

        if added > 0:
            self._log(f"Added {added} new links (Depth {child_depth})")
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Iterator
|
|
3
|
+
|
|
4
|
+
from sayou.core.registry import register_component
|
|
5
|
+
from sayou.core.schemas import SayouPacket, SayouTask
|
|
6
|
+
|
|
7
|
+
from ..interfaces.base_generator import BaseGenerator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_component("generator")
class SqliteGenerator(BaseGenerator):
    """
    Generator that pages through a SQLite query with LIMIT/OFFSET.

    Each yielded task carries the base query with a LIMIT/OFFSET suffix. The
    offset advances by one batch per task until `_do_feedback` sees a failed
    fetch, an empty result, or a batch shorter than `batch_size`.
    """

    component_name = "SqliteGenerator"
    SUPPORTED_TYPES = ["sqlite"]

    @classmethod
    def can_handle(cls, source: str) -> float:
        """
        Score how likely `source` refers to a SQLite database.

        Args:
            source (str): The input source string to evaluate.

        Returns:
            float: 1.0 for a 'sqlite:///' URI or an existing file ending in
                '.db', '.sqlite' or '.sqlite3'; 0.9 when the extension
                matches but the file does not exist; 0.0 otherwise.
        """
        candidate = source.strip()
        lowered = candidate.lower()

        if lowered.startswith("sqlite:///"):
            return 1.0

        if lowered.endswith((".db", ".sqlite", ".sqlite3")):
            return 1.0 if os.path.isfile(candidate) else 0.9

        return 0.0

    def initialize(
        self,
        source: str,
        query: str = None,
        batch_size: int = 1000,
        **kwargs,
    ):
        """
        Set up the pagination state.

        Args:
            source (str): Database file path or 'sqlite://' style URI.
            query (str): Base SQL query without LIMIT/OFFSET; a trailing ';'
                is removed. When omitted, the table names in `sqlite_master`
                are queried instead.
            batch_size (int): Number of rows requested per task.
            **kwargs: Ignored additional arguments.
        """
        self.conn_str = self._clean_source(source)
        if query:
            # Strip the terminator so LIMIT/OFFSET can be appended later.
            self.base_query = query.strip().rstrip(";")
        else:
            self.base_query = "SELECT name FROM sqlite_master WHERE type='table'"
        self.batch_size = batch_size
        self.current_offset = 0
        self.stop_flag = False

    def _clean_source(self, source: str) -> str:
        """
        Strip a leading 'sqlite:///' or 'sqlite://' scheme from `source`.

        The prefix is matched case-insensitively; everything after it is
        returned unchanged so it can be passed to `sqlite3.connect`.

        Args:
            source (str): The input source string (e.g., 'sqlite:///data/test.db').

        Returns:
            str: The remaining path (e.g., 'data/test.db'), or the trimmed
                input when no known prefix is present.
        """
        trimmed = source.strip()
        lowered = trimmed.lower()
        # Longest prefix first, otherwise 'sqlite://' would shadow 'sqlite:///'.
        for prefix in ("sqlite:///", "sqlite://"):
            if lowered.startswith(prefix):
                return trimmed[len(prefix):]
        return trimmed

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Yield one pagination task per batch until the stop flag is set.

        Yields:
            Iterator[SayouTask]: Tasks with `source_type='sqlite'`, the
                paginated query in `params['query']` and the offset/batch
                size in `meta`.
        """
        while not self.stop_flag:
            offset = self.current_offset
            size = self.batch_size

            yield SayouTask(
                source_type="sqlite",
                uri=self.conn_str,
                params={"query": f"{self.base_query} LIMIT {size} OFFSET {offset}"},
                meta={"offset": offset, "batch": size},
            )

            # Advance only after the consumer resumes the generator.
            self.current_offset += self.batch_size

    def _do_feedback(self, result: SayouPacket):
        """
        Decide from a fetch result whether pagination has finished.

        Args:
            result (SayouPacket): The result from the Fetcher.
        """
        # 1. Stop when the fetch failed or returned no data.
        if not (result.success and result.data):
            self._log("No data returned or fetch failed. Stopping.", level="warning")
            self.stop_flag = True
            return

        # 2. Fewer rows than the batch size means this was the last page.
        rows = result.data
        if not isinstance(rows, list):
            return

        fetched = len(rows)
        if fetched < self.batch_size:
            self._log(
                f"Reached end of records (Fetched {fetched} < Batch {self.batch_size})."
            )
            self.stop_flag = True
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from abc import abstractmethod
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from sayou.core.base_component import BaseComponent
|
|
5
|
+
from sayou.core.decorators import measure_time, retry
|
|
6
|
+
from sayou.core.schemas import SayouPacket, SayouTask
|
|
7
|
+
|
|
8
|
+
from ..core.exceptions import FetcherError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseFetcher(BaseComponent):
    """
    (Tier 1) Abstract base class for all data fetchers.

    Template Method: `fetch()` owns event emission, logging and error
    wrapping, and delegates the actual retrieval to the `_do_fetch()` hook
    that every concrete fetcher implements.
    """

    component_name = "BaseFetcher"
    SUPPORTED_TYPES = []

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """
        Score whether this fetcher can handle the given task URI.

        The base implementation always answers 0.0.
        """
        return 0.0

    # NOTE(review): fetch() catches every Exception and returns a failed
    # packet, so if `retry` only reacts to raised exceptions it can never
    # trigger here -- confirm against sayou.core.decorators.
    @measure_time
    @retry(max_retries=3, delay=1.0)
    def fetch(self, task: SayouTask) -> SayouPacket:
        """
        Run the fetch for one task and always hand back a packet.

        Emits 'on_start' before the attempt, then 'on_finish' on success or
        'on_error' on failure. Any exception raised during the attempt is
        logged as a `FetcherError` and reported through the returned packet
        instead of being propagated.

        Args:
            task (SayouTask): The task definition containing the URI and parameters.

        Returns:
            SayouPacket: `success=True` with the fetched data, or
                `success=False` with `data=None` and the error message.
        """
        self._emit("on_start", input_data=task)
        self._log(f"Fetching: {task.uri} ({task.source_type})", level="debug")

        try:
            outcome = SayouPacket(task=task, data=self._do_fetch(task), success=True)
            self._emit("on_finish", result_data=outcome, success=True)
            return outcome

        except Exception as exc:
            self._emit("on_error", error=exc)
            failure = FetcherError(
                f"[{self.component_name}] Failed to fetch: {exc!s}"
            )
            # exc_info=True attaches the traceback of the original exception.
            self.logger.error(failure, exc_info=True)

            return SayouPacket(
                task=task, data=None, success=False, error=str(failure)
            )

    @abstractmethod
    def _do_fetch(self, task: SayouTask) -> Any:
        """
        [Abstract Hook] Retrieve the data described by `task`.

        Args:
            task (SayouTask): The task containing source URI and params.

        Returns:
            Any: The raw data retrieved (e.g., bytes, str, list).

        Raises:
            Exception: Any exception may be raised on failure; `fetch`
                catches it and converts it into a failed packet.
        """
        raise NotImplementedError
|