sayou-connector 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
+ import os
2
+ from typing import Iterator
3
+
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouTask
6
+
7
+ from ..interfaces.base_generator import BaseGenerator
8
+
9
+
10
@register_component("generator")
class GoogleCalendarGenerator(BaseGenerator):
    """
    Generates a single task for one Google Calendar using an OAuth 2.0 token.

    Requires 'sayou_google_token.json' (generated by auth script). The path to
    that file is read from the ``google_token_path`` keyword argument.

    URI Schema:
    - gcal://               (falls back to the "primary" calendar)
    - gcal://{calendarID}   (specific calendar)
    """

    component_name = "GoogleCalendarGenerator"
    SUPPORTED_TYPES = ["google_calendar"]

    _URI_PREFIX = "gcal://"

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gcal://`` URIs and 0.0 for anything else."""
        return 1.0 if uri.startswith(cls._URI_PREFIX) else 0.0

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Yield one task describing the calendar named by ``source``.

        Args:
            source: A ``gcal://`` URI; an empty calendar id means "primary".
            **kwargs: Must contain ``google_token_path``.

        Raises:
            FileNotFoundError: If ``google_token_path`` is missing or does not
                point to an existing file.
        """
        token_path = kwargs.get("google_token_path")

        # os.path.exists(None) raises TypeError, so a missing kwarg is reported
        # with the same actionable error as a missing file.
        if not token_path or not os.path.exists(token_path):
            raise FileNotFoundError(
                f"Google Token not found at {token_path}. Run authentication script first."
            )

        # Strip only the scheme prefix instead of every occurrence in the URI.
        if source.startswith(self._URI_PREFIX):
            calendar_id = source[len(self._URI_PREFIX) :]
        else:
            calendar_id = source
        calendar_id = calendar_id or "primary"

        yield SayouTask(
            uri=source,
            source_type="google_calendar",
            params={
                "calendar_id": calendar_id,
                "token_path": token_path,
            },
            meta={
                "source": "google_calendar",
                "calendar_id": calendar_id,
                # '@' is replaced so the id can be used as a file name stem.
                "filename": f"calendar_{calendar_id.replace('@', '_')}",
            },
        )
@@ -0,0 +1,151 @@
1
+ import io
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ from sayou.core.registry import register_component
6
+ from sayou.core.schemas import SayouTask
7
+
8
+ from ..interfaces.base_fetcher import BaseFetcher
9
+
10
+ try:
11
+ import chardet
12
+ from google.oauth2.credentials import Credentials
13
+ from googleapiclient.discovery import build
14
+ from googleapiclient.errors import HttpError
15
+ from googleapiclient.http import MediaIoBaseDownload
16
+ except ImportError:
17
+ build = None
18
+ chardet = None
19
+
20
+
21
@register_component("fetcher")
class GoogleDriveFetcher(BaseFetcher):
    """
    Fetches content from Google Drive files.
    - Google Native Formats -> Converted to MS Office formats (.docx, .xlsx, .pptx)
    - Standard Files (PDF, JPG, ZIP...) -> Downloaded as original binary.

    Text-like files that are detected as a non-UTF-8 encoding are transcoded
    to UTF-8 bytes; the returned content is always ``bytes``.
    """

    component_name = "GoogleDriveFetcher"
    SUPPORTED_TYPES = ["drive"]

    # Google-native MIME type -> (export MIME type, file extension).
    _EXPORT_FORMATS = {
        "application/vnd.google-apps.document": (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".docx",
        ),
        "application/vnd.google-apps.spreadsheet": (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".xlsx",
        ),
        "application/vnd.google-apps.presentation": (
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".pptx",
        ),
    }

    # Extensions treated as text even when the MIME type does not say so.
    _TEXT_EXTENSIONS = frozenset(
        {".csv", ".txt", ".json", ".md", ".py", ".html", ".xml"}
    )

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gdrive://file/`` URIs and 0.0 for anything else."""
        return 1.0 if uri.startswith("gdrive://file/") else 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Download (or export) the Drive file described by ``task``.

        Reads ``token_path``, ``file_id`` and ``mime_type`` from
        ``task.params`` and ``filename`` from ``task.meta``.

        Returns:
            A dict with ``content`` (bytes) and ``meta``. When the Drive API
            answers with an ``HttpError`` the content is ``b""`` and the meta
            carries an ``error`` string instead of raising.

        Raises:
            ImportError: If the optional Google client libraries / chardet
                could not be imported.
        """
        # The module-level import guard sets `build` to None on failure and
        # leaves Credentials / HttpError / MediaIoBaseDownload undefined, so
        # fail here with a clear message rather than a NameError further down.
        if build is None:
            raise ImportError(
                "GoogleDriveFetcher requires 'google-api-python-client', "
                "'google-auth' and 'chardet' to be installed."
            )

        token_path = task.params.get("token_path")
        file_id = task.params.get("file_id")
        mime_type = task.params.get("mime_type")
        original_name = task.meta.get("filename") or "unknown_file"

        creds = Credentials.from_authorized_user_file(token_path)
        service = build("drive", "v3", credentials=creds)

        # 1. Google Native Formats are exported; everything else is downloaded
        export = self._EXPORT_FORMATS.get(mime_type)
        is_google_doc = export is not None
        if is_google_doc:
            export_mime, extension = export
            request = service.files().export_media(
                fileId=file_id, mimeType=export_mime
            )
        else:
            request = service.files().get_media(fileId=file_id)
            extension = os.path.splitext(original_name)[1]

        # 2. Execute Download
        try:
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()
        except HttpError as e:
            self._log(f"Drive Download Failed ({file_id}): {e}", level="error")
            return {
                "content": b"",
                "meta": {"source": "google_drive", "error": str(e), "file_id": file_id},
            }

        raw_bytes = fh.getvalue()
        final_content = raw_bytes

        # mime_type may be absent from the task; treat that as "unknown".
        mime = mime_type or ""
        is_text_candidate = (
            mime.startswith("text/")
            or mime == "application/json"
            or extension.lower() in self._TEXT_EXTENSIONS
        )

        if not is_google_doc and is_text_candidate:
            final_content = self._to_utf8_bytes(raw_bytes, original_name)

        # 3. Return
        if original_name.endswith(extension):
            suggested_filename = original_name
        else:
            suggested_filename = f"{original_name}{extension}"

        return {
            "content": final_content,
            "meta": {
                "source": "google_drive",
                "file_id": file_id,
                "mime_type": mime_type,
                "original_filename": original_name,
                "suggested_filename": suggested_filename,
                "extension": extension,
                "is_binary": isinstance(final_content, bytes),
            },
        }

    def _to_utf8_bytes(self, raw_bytes: bytes, original_name: str) -> bytes:
        """Transcode text bytes (e.g. EUC-KR) to UTF-8; return them unchanged on doubt."""
        detected = chardet.detect(raw_bytes)
        encoding = detected.get("encoding")
        confidence = detected.get("confidence") or 0

        # Already UTF-8 compatible, or the detection is too uncertain to trust.
        if (
            not encoding
            or encoding.lower() in ("utf-8", "ascii")
            or confidence <= 0.6
        ):
            return raw_bytes

        try:
            # Decode (Bytes -> Str), then encode back (Str -> UTF-8 Bytes)
            transcoded = raw_bytes.decode(encoding).encode("utf-8")
        except (LookupError, UnicodeError) as e:
            self._log(
                f"Encoding conversion failed: {e}. Keeping raw bytes.",
                level="warning",
            )
            return raw_bytes

        self._log(f"Transcoded {original_name} from {encoding} to utf-8 bytes.")
        return transcoded
@@ -0,0 +1,107 @@
1
+ import os
2
+ from typing import Iterator
3
+
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouTask
6
+
7
+ from ..interfaces.base_generator import BaseGenerator
8
+
9
+ try:
10
+ from google.oauth2.credentials import Credentials
11
+ from googleapiclient.discovery import build
12
+ except ImportError:
13
+ build = None
14
+
15
+
16
@register_component("generator")
class GoogleDriveGenerator(BaseGenerator):
    """
    Generates tasks for files in Google Drive.
    URI Schema:
    - gdrive://root (My Drive Root)
    - gdrive://{folderID} (Specific Folder)

    Only the direct children of the folder are listed; sub-folders are
    skipped, not descended into.
    """

    component_name = "GoogleDriveGenerator"
    SUPPORTED_TYPES = ["drive"]

    # Google-native MIME type -> (task URI prefix, task source type).
    _NATIVE_TARGETS = {
        "application/vnd.google-apps.document": ("gdocs://document/", "docs"),
        "application/vnd.google-apps.spreadsheet": (
            "gsheets://spreadsheet/",
            "sheets",
        ),
        "application/vnd.google-apps.presentation": (
            "gslides://presentation/",
            "slides",
        ),
    }
    _FOLDER_MIME = "application/vnd.google-apps.folder"

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gdrive://`` URIs and 0.0 for anything else."""
        return 1.0 if uri.startswith("gdrive://") else 0.0

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        List the folder named by ``source`` and yield one task per file.

        Args:
            source: ``gdrive://root`` or ``gdrive://{folderID}``. Anything
                after a ``?`` is ignored.
            **kwargs: Must contain ``google_token_path``.

        Raises:
            ImportError: If the Google client libraries could not be imported.
            FileNotFoundError: If ``google_token_path`` is missing or does not
                point to an existing file.
        """
        # The module-level import guard sets `build` to None on failure and
        # leaves `Credentials` undefined; fail with a clear message instead.
        if build is None:
            raise ImportError(
                "GoogleDriveGenerator requires 'google-api-python-client' "
                "and 'google-auth' to be installed."
            )

        # 1. Certification
        token_path = kwargs.get("google_token_path")

        # os.path.exists(None) raises TypeError; report it as a missing token.
        if not token_path or not os.path.exists(token_path):
            raise FileNotFoundError(
                f"Google Token not found at {token_path}. Run authentication script first."
            )

        creds = Credentials.from_authorized_user_file(token_path)
        service = build("drive", "v3", credentials=creds)

        # 2. Search Query
        root_id = source.replace("gdrive://", "")
        root_id = root_id.split("?", 1)[0] or "root"

        # Backslash and single quote must be escaped inside a Drive query string.
        quoted_id = root_id.replace("\\", "\\\\").replace("'", "\\'")
        query = f"'{quoted_id}' in parents and trashed = false"

        # 3. File Search (flat), following nextPageToken so that folders with
        # more than one page of results are listed completely.
        page_token = None
        while True:
            results = (
                service.files()
                .list(
                    q=query,
                    pageSize=100,
                    fields="nextPageToken, files(id, name, mimeType, webViewLink, createdTime, modifiedTime)",
                    pageToken=page_token,
                )
                .execute()
            )

            for file in results.get("files", []):
                mime_type = file.get("mimeType")
                file_id = file["id"]
                file_name = file["name"]

                if mime_type == self._FOLDER_MIME:
                    continue

                uri_prefix, source_type = self._NATIVE_TARGETS.get(
                    mime_type, ("gdrive://file/", "drive")
                )

                yield SayouTask(
                    uri=f"{uri_prefix}{file_id}",
                    source_type=source_type,
                    params={
                        "file_id": file_id,
                        "mime_type": mime_type,
                        "token_path": token_path,
                    },
                    meta={
                        "source": source_type,
                        "filename": file_name,
                        "file_id": file_id,
                        "mime_type": mime_type,
                        "link": file.get("webViewLink"),
                    },
                )

            page_token = results.get("nextPageToken")
            if not page_token:
                break
@@ -0,0 +1,140 @@
1
+ import email
2
+ import imaplib
3
+ from email.header import decode_header
4
+ from typing import Any, Dict
5
+
6
+ from sayou.core.registry import register_component
7
+ from sayou.core.schemas import SayouTask
8
+
9
+ from ..interfaces.base_fetcher import BaseFetcher
10
+
11
+ try:
12
+ import html2text
13
+ except ImportError:
14
+ html2text = None
15
+
16
+
17
@register_component("fetcher")
class ImapEmailFetcher(BaseFetcher):
    """
    Fetches a specific email body from ANY IMAP server and converts it to HTML.
    """

    component_name = "ImapEmailFetcher"
    SUPPORTED_TYPES = ["imap", "email"]

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``imap-msg://`` URIs and 0.0 for anything else."""
        return 1.0 if uri.startswith("imap-msg://") else 0.0

    def _do_fetch(self, task: SayouTask) -> str:
        """
        Reconnects -> Fetches UID -> Parses -> Returns HTML String.

        Reads ``uid``, ``username`` and ``password`` (required) plus ``folder``
        and ``imap_server`` (optional) from ``task.params``.

        Raises:
            RuntimeError: If login, folder selection, fetch or parsing fails;
                the original exception is attached as ``__cause__``.
        """
        # Function-scope import keeps this block self-contained.
        from html import escape

        params = task.params
        uid = params["uid"]
        folder = params.get("folder", "INBOX")
        imap_server = params.get("imap_server") or "imap.gmail.com"

        mail = imaplib.IMAP4_SSL(imap_server)

        try:
            mail.login(params["username"], params["password"])
            mail.select(folder)

            # NOTE(review): IMAP4.fetch addresses messages by sequence number,
            # not by IMAP UID — confirm that the task's "uid" is one.
            status, msg_data = mail.fetch(uid, "(RFC822)")

            # A fetch for a non-existent message answers OK with [None].
            if status != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
                raise ValueError(
                    f"Email UID {uid} not found or fetch failed on {imap_server}."
                )

            raw_email = msg_data[0][1]
            msg = email.message_from_bytes(raw_email)

            parsed_content = self._parse_email(msg)

            # Header values come from the remote message, so they are escaped
            # before being placed in the title and in attribute values. The
            # body is inserted as-is because it may already be HTML.
            subject = escape(str(parsed_content["subject"]))
            sender = escape(str(parsed_content["sender"]))
            date = escape(str(parsed_content["date"] or ""))

            html_doc = "\n".join(
                [
                    "<!DOCTYPE html>",
                    "<html>",
                    "<head>",
                    f"<title>{subject}</title>",
                    f'<meta name="sender" content="{sender}">',
                    f'<meta name="date" content="{date}">',
                    f'<meta name="uid" content="{escape(str(uid))}">',
                    '<meta name="source" content="imap">',
                    f'<meta name="server" content="{escape(str(imap_server))}">',
                    "</head>",
                    "<body>",
                    f"{parsed_content['body']}",
                    "</body>",
                    "</html>",
                ]
            )

            return html_doc.strip()

        except Exception as e:
            raise RuntimeError(f"Failed to fetch email from {imap_server}: {e}") from e

        finally:
            # Best-effort logout: a failure here must not mask the real result.
            try:
                mail.logout()
            except (imaplib.IMAP4.error, OSError):
                pass

    def _parse_email(self, msg) -> Dict[str, Any]:
        """
        Extract subject, sender, date and body text from a parsed message.

        For multipart messages an HTML part is preferred over plain text, and
        parts marked as attachments are ignored so that an attached text file
        cannot replace the real body.
        """
        subject = self._decode_header(msg["Subject"])
        sender = self._decode_header(msg["From"], default="(Unknown Sender)")
        date = msg["Date"]

        body_content = ""
        html_found = False

        if msg.is_multipart():
            for part in msg.walk():
                ctype = part.get_content_type()
                if ctype not in ("text/html", "text/plain"):
                    continue
                if part.get_content_disposition() == "attachment":
                    continue

                payload = part.get_payload(decode=True)
                if not payload:
                    continue

                text = self._decode_payload(payload, part.get_content_charset())

                if ctype == "text/html":
                    body_content = text
                    html_found = True
                elif not html_found:
                    body_content = text
        else:
            payload = msg.get_payload(decode=True)
            # get_payload(decode=True) can return None; keep the body empty then.
            if payload:
                body_content = self._decode_payload(
                    payload, msg.get_content_charset()
                )

        return {
            "subject": subject,
            "sender": sender,
            "date": date,
            "body": body_content,
        }

    def _decode_payload(self, payload: bytes, charset) -> str:
        """Decode body bytes with the declared charset, falling back to UTF-8."""
        try:
            return payload.decode(charset or "utf-8", errors="ignore")
        except LookupError:
            # The message declared a charset Python does not know.
            return payload.decode("utf-8", errors="ignore")

    def _decode_header(self, header_text, default="(No Subject)"):
        """Decodes MIME headers (e.g., =?utf-8?b?...); returns ``default`` when empty."""
        if not header_text:
            return default

        chunks = []
        for value, encoding in decode_header(header_text):
            if isinstance(value, bytes):
                chunks.append(self._decode_payload(value, encoding))
            else:
                chunks.append(str(value))
        return "".join(chunks)
@@ -0,0 +1,93 @@
1
+ import imaplib
2
+ from typing import Iterator
3
+
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouTask
6
+
7
+ from ..interfaces.base_generator import BaseGenerator
8
+
9
+
10
@register_component("generator")
class ImapEmailGenerator(BaseGenerator):
    """
    Scans Generic IMAP inbox and generates tasks for individual emails.
    Supports Gmail, Naver, Daum, Outlook, etc.
    """

    component_name = "ImapEmailGenerator"
    SUPPORTED_TYPES = ["imap", "email"]

    @classmethod
    def can_handle(cls, source: str) -> float:
        """Return 1.0 for ``imap://`` sources and 0.0 for anything else."""
        return 1.0 if source.startswith("imap://") else 0.0

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Connects to IMAP Server -> Search -> Yield Tasks.

        Args:
            source: ``imap://{host}``; an empty host falls back to the
                ``imap_server`` kwarg and then to "imap.gmail.com".
            **kwargs: ``username`` and ``password`` (required), ``folder``
                (default "INBOX"), ``limit`` (default 10, newest messages
                first; a value <= 0 yields nothing) and ``search_criteria``
                (default "ALL").

        Raises:
            ValueError: If the credentials are missing.
            RuntimeError: If connecting, logging in or searching fails; the
                original exception is attached as ``__cause__``.
        """
        # 1. Parse connection information
        parsed_host = source.replace("imap://", "").strip().rstrip("/")
        imap_server = parsed_host or kwargs.get("imap_server") or "imap.gmail.com"

        username = kwargs.get("username")
        password = kwargs.get("password")

        if not username or not password:
            raise ValueError(
                "IMAP credentials (username, password) required in kwargs."
            )

        folder = kwargs.get("folder", "INBOX")
        limit = int(kwargs.get("limit", 10))

        # 2. IMAP connection and search
        # Only the requested server is contacted, and the connection is
        # released before any task is yielded so that it is not held open
        # while the consumer works through the tasks.
        mail = None
        try:
            mail = imaplib.IMAP4_SSL(imap_server)
            mail.login(username, password)
            mail.select(folder)

            # Search criteria (e.g., '(UNSEEN)' or 'ALL')
            criteria = kwargs.get("search_criteria", "ALL")
            status, messages = mail.search(None, criteria)

            if status != "OK":
                return

            mail_ids = messages[0].split()
        except Exception as e:
            raise RuntimeError(f"IMAP connection failed to {imap_server}: {e}") from e
        finally:
            self._disconnect(mail)

        # mail_ids[-0:] would be the whole list, so a non-positive limit is
        # handled explicitly.
        target_ids = mail_ids[-limit:] if limit > 0 else []

        self._log(
            f"📧 [{imap_server}] Found {len(mail_ids)} emails. Generating tasks for last {len(target_ids)}."
        )

        # 3. Task generation (one task per email)
        for b_id in reversed(target_ids):
            uid = b_id.decode()

            # Fetcher will process this internal protocol
            task_uri = f"imap-msg://{imap_server}/{folder}/{uid}"

            # NOTE(review): the plaintext password travels inside every task
            # because the fetcher reads its credentials from task.params.
            yield SayouTask(
                uri=task_uri,
                source_type="imap",
                params={
                    "imap_server": imap_server,
                    "username": username,
                    "password": password,
                    "uid": uid,
                    "folder": folder,
                },
                meta={"source": "imap", "server": imap_server, "email_id": uid},
            )

    def _disconnect(self, mail) -> None:
        """Best-effort close + logout; each step is attempted independently."""
        if mail is None:
            return
        # close() fails when no folder was selected; logout() must still run,
        # otherwise the connection is leaked.
        for step in (mail.close, mail.logout):
            try:
                step()
            except (imaplib.IMAP4.error, OSError):
                pass