sayou-connector 0.3.7__tar.gz → 0.3.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/PKG-INFO +1 -1
  2. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/pyproject.toml +1 -1
  3. sayou_connector-0.3.16/src/sayou/connector/plugins/gmail_fetcher.py +128 -0
  4. sayou_connector-0.3.16/src/sayou/connector/plugins/gmail_generator.py +81 -0
  5. sayou_connector-0.3.16/src/sayou/connector/plugins/google_calendar_fetcher.py +89 -0
  6. sayou_connector-0.3.16/src/sayou/connector/plugins/google_calendar_generator.py +46 -0
  7. sayou_connector-0.3.16/src/sayou/connector/plugins/google_docs_fetcher.py +113 -0
  8. sayou_connector-0.3.16/src/sayou/connector/plugins/google_drive_fetcher.py +151 -0
  9. sayou_connector-0.3.16/src/sayou/connector/plugins/google_drive_generator.py +107 -0
  10. sayou_connector-0.3.16/src/sayou/connector/plugins/google_sheets_fetcher.py +85 -0
  11. sayou_connector-0.3.16/src/sayou/connector/plugins/google_slides_fetcher.py +99 -0
  12. sayou_connector-0.3.16/src/sayou/connector/plugins/google_youtube_fetcher.py +60 -0
  13. sayou_connector-0.3.16/src/sayou/connector/plugins/google_youtube_generator.py +86 -0
  14. sayou_connector-0.3.7/src/sayou/connector/plugins/gmail_fetcher.py → sayou_connector-0.3.16/src/sayou/connector/plugins/imap_email_fetcher.py +23 -10
  15. sayou_connector-0.3.7/src/sayou/connector/plugins/gmail_generator.py → sayou_connector-0.3.16/src/sayou/connector/plugins/imap_email_generator.py +24 -12
  16. sayou_connector-0.3.16/src/sayou/connector/plugins/public_youtube_fetcher.py +134 -0
  17. sayou_connector-0.3.16/src/sayou/connector/plugins/public_youtube_generator.py +60 -0
  18. sayou_connector-0.3.16/src/sayou/connector/plugins/trafilatura_fetcher.py +51 -0
  19. sayou_connector-0.3.16/src/sayou/connector/plugins/trafilatura_generator.py +32 -0
  20. sayou_connector-0.3.16/src/sayou/connector/plugins/wikipedia_fetcher.py +41 -0
  21. sayou_connector-0.3.16/src/sayou/connector/plugins/wikipedia_generator.py +43 -0
  22. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/.gitignore +0 -0
  23. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/README.md +0 -0
  24. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/examples/quick_start.ipynb +0 -0
  25. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/examples/quick_start.py +0 -0
  26. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/__init__.py +0 -0
  27. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/core/exceptions.py +0 -0
  28. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/fetcher/file_fetcher.py +0 -0
  29. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/fetcher/requests_fetcher.py +0 -0
  30. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/fetcher/sqlite_fetcher.py +0 -0
  31. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/generator/file_generator.py +0 -0
  32. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/generator/requests_generator.py +0 -0
  33. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/generator/sqlite_generator.py +0 -0
  34. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/interfaces/base_fetcher.py +0 -0
  35. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/interfaces/base_generator.py +0 -0
  36. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/pipeline.py +0 -0
  37. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/plugins/notion_fetcher.py +0 -0
  38. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/plugins/notion_generator.py +0 -0
  39. {sayou_connector-0.3.7 → sayou_connector-0.3.16}/tests/test_connector.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sayou-connector
3
- Version: 0.3.7
3
+ Version: 0.3.16
4
4
  Summary: Connector components for the Sayou Data Platform
5
5
  Project-URL: Homepage, https://www.sayouzone.com/
6
6
  Project-URL: Documentation, https://sayouzone.github.io/sayou-fabric/
@@ -7,7 +7,7 @@ build-backend = "hatchling.build"
7
7
  # -----------------
8
8
  [project]
9
9
  name = "sayou-connector"
10
- version = "0.3.7"
10
+ version = "0.3.16"
11
11
  authors = [
12
12
  { name = "Sayouzone", email = "contact@sayouzone.com" },
13
13
  ]
@@ -0,0 +1,128 @@
1
+ import base64
2
+ from typing import Any, Dict
3
+
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouTask
6
+
7
+ from ..interfaces.base_fetcher import BaseFetcher
8
+
9
+ try:
10
+ from google.oauth2.credentials import Credentials
11
+ from googleapiclient.discovery import build
12
+ except ImportError:
13
+ build = None
14
+
15
+
16
@register_component("fetcher")
class GmailFetcher(BaseFetcher):
    """
    Fetches a single email through the Gmail API (OAuth user token).

    The message is rebuilt into a small standardized HTML document: header
    values go into ``<title>`` / ``<meta>`` tags and the decoded body goes
    into ``<body>``, which is the shape Refinery consumes.
    """

    component_name = "GmailFetcher"
    SUPPORTED_TYPES = ["gmail"]

    _URI_PREFIX = "gmail-msg://"

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gmail-msg://`` URIs and 0.0 for anything else."""
        return 1.0 if uri.startswith("gmail-msg://") else 0.0

    def _do_fetch(self, task: SayouTask) -> str:
        """
        Download one message and return it as an HTML document string.

        Reads ``token_path`` and ``msg_id`` from ``task.params``; when
        ``msg_id`` is absent it is taken from the ``gmail-msg://<id>`` URI.

        Raises:
            ImportError: the Google API client libraries are not installed.
            ValueError: ``token_path`` or the message id is missing.
        """
        # Local import: only needed to escape values placed into markup.
        import html

        if not build:
            raise ImportError("Please install google-api-python-client")

        token_path = task.params.get("token_path")
        if not token_path:
            raise ValueError("GmailFetcher requires 'token_path' in task params.")

        msg_id = task.params.get("msg_id")
        if not msg_id and task.uri.startswith(self._URI_PREFIX):
            msg_id = task.uri[len(self._URI_PREFIX):]
        if not msg_id:
            raise ValueError("GmailFetcher requires 'msg_id' in task params.")

        creds = Credentials.from_authorized_user_file(token_path)
        service = build("gmail", "v1", credentials=creds)

        # 1. Fetch email details (format='full' includes headers and parts)
        message = (
            service.users()
            .messages()
            .get(userId="me", id=msg_id, format="full")
            .execute()
        )

        payload = message.get("payload", {})
        headers = payload.get("headers", [])

        # 2. Parse headers (Subject, From, Date)
        subject = self._get_header(headers, "Subject", "(No Subject)")
        sender = self._get_header(headers, "From", "Unknown")
        date = self._get_header(headers, "Date", "")

        # 3. Extract body (recursive over MIME parts)
        body_content = self._extract_body(payload)

        # 4. Reconstruct HTML. Header values come from the message itself, so
        #    they are escaped; a sender such as `"Bob" <bob@x.com>` would
        #    otherwise terminate the attribute and corrupt the document.
        html_doc = "\n".join(
            [
                "<!DOCTYPE html>",
                "<html>",
                "<head>",
                f"<title>{html.escape(subject)}</title>",
                f'<meta name="sender" content="{html.escape(sender)}">',
                f'<meta name="date" content="{html.escape(date)}">',
                f'<meta name="msg_id" content="{html.escape(str(msg_id))}">',
                '<meta name="source" content="gmail">',
                "</head>",
                "<body>",
                body_content,
                "</body>",
                "</html>",
            ]
        )

        return html_doc.strip()

    def _get_header(self, headers: list, name: str, default: str) -> str:
        """Return the value of header *name* (case-insensitive) or *default*."""
        wanted = name.lower()
        for header in headers:
            # .get() so a malformed header entry is skipped instead of raising.
            if header.get("name", "").lower() == wanted:
                return header.get("value", default)
        return default

    def _extract_body(self, payload: dict) -> str:
        """
        Return the best displayable body found in *payload* as HTML.

        A part carrying inline data is returned directly (``text/html`` as-is,
        ``text/plain`` escaped and wrapped in ``<pre>``). Otherwise child
        parts are searched recursively, preferring HTML over plain text.
        Returns an empty string when nothing usable is found.
        """
        import html

        mime_type = payload.get("mimeType", "")
        data = (payload.get("body") or {}).get("data")

        # Case A: this part carries the content itself.
        if data:
            decoded_text = self._decode_base64url(data)
            if mime_type == "text/html":
                return decoded_text
            if mime_type == "text/plain":
                # Escape so characters like '<' in plain text stay literal.
                return f"<pre>{html.escape(decoded_text, quote=False)}</pre>"

        # Case B: container part; the first non-empty candidate of each kind
        # wins, so a later part cannot replace a body already found.
        html_part = ""
        text_part = ""
        for part in payload.get("parts") or []:
            content = self._extract_body(part)
            if not content:
                continue
            part_type = part.get("mimeType", "")
            if part_type == "text/html":
                html_part = html_part or content
            elif part_type == "text/plain":
                text_part = text_part or content
            elif "multipart" in part_type:
                html_part = html_part or content

        return html_part or text_part

    def _decode_base64url(self, data: str) -> str:
        """
        Decode URL-safe base64 *data* to text, restoring stripped padding.

        Undecodable input yields an empty string; invalid UTF-8 bytes are
        replaced rather than raising.
        """
        try:
            padded = data + "=" * (-len(data) % 4)
            return base64.urlsafe_b64decode(padded).decode("utf-8", errors="replace")
        except (ValueError, TypeError):
            # binascii.Error is a ValueError; TypeError covers non-str input.
            return ""
@@ -0,0 +1,81 @@
1
+ from typing import Iterator
2
+
3
+ from sayou.core.registry import register_component
4
+ from sayou.core.schemas import SayouTask
5
+
6
+ from ..interfaces.base_generator import BaseGenerator
7
+
8
+ try:
9
+ from google.oauth2.credentials import Credentials
10
+ from googleapiclient.discovery import build
11
+ except ImportError:
12
+ build = None
13
+
14
+
15
@register_component("generator")
class GmailGenerator(BaseGenerator):
    """
    Lists Gmail messages through the Gmail API (OAuth) and yields one task
    per message for a fetcher that handles ``gmail-msg://`` URIs.
    """

    component_name = "GmailGenerator"
    SUPPORTED_TYPES = ["gmail"]

    @classmethod
    def can_handle(cls, source: str) -> float:
        """Return 1.0 for ``gmail://`` sources and 0.0 for anything else."""
        return 1.0 if source.startswith("gmail://") else 0.0

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Connect to the Gmail API, run a search, and yield one task per hit.

        source example: ``gmail://me`` (default) or ``gmail://me?q=is:unread``

        Keyword Args:
            token_path: path to the authorized-user token file (required).
            query: Gmail search expression. When omitted, the ``q`` parameter
                of *source* is used, falling back to ``"is:inbox"``.
            limit: maximum number of messages to request (default 10).

        Raises:
            ImportError: the Google API client libraries are not installed.
            ValueError: ``token_path`` was not supplied.
        """
        # Local import: only needed to read the optional ?q= part of source.
        from urllib.parse import parse_qs, urlsplit

        if not build:
            raise ImportError(
                "Please install google-api-python-client google-auth-oauthlib"
            )

        token_path = kwargs.get("token_path")
        if not token_path:
            raise ValueError("GmailGenerator requires 'token_path' in kwargs.")

        # 1. Parsing Parameters. An explicit `query` kwarg always wins; the
        #    URI query is honored otherwise, as the source example documents.
        if "query" in kwargs:
            query = kwargs["query"]
        else:
            uri_queries = parse_qs(urlsplit(source).query).get("q", [])
            query = uri_queries[0] if uri_queries else "is:inbox"
        max_results = int(kwargs.get("limit", 10))

        # 2. Connect to Gmail API
        creds = Credentials.from_authorized_user_file(token_path)
        service = build("gmail", "v1", credentials=creds)

        # 3. Fetch email list. Only the API call is guarded, so the log
        #    message below is never attached to an unrelated failure.
        try:
            results = (
                service.users()
                .messages()
                .list(userId="me", q=query, maxResults=max_results)
                .execute()
            )
        except Exception as e:
            self._log(f"Gmail API List failed: {e}", level="error")
            raise

        messages = results.get("messages", [])

        self._log(f"📧 Found {len(messages)} emails. Generating tasks...")

        # 4. Generate tasks
        for msg in messages:
            msg_id = msg["id"]

            yield SayouTask(
                uri=f"gmail-msg://{msg_id}",
                source_type="gmail",
                params={
                    "token_path": token_path,
                    "msg_id": msg_id,
                    "thread_id": msg.get("threadId"),
                },
            )
@@ -0,0 +1,89 @@
1
+ import datetime
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ from sayou.core.registry import register_component
6
+ from sayou.core.schemas import SayouTask
7
+
8
+ from ..interfaces.base_fetcher import BaseFetcher
9
+
10
+ try:
11
+ from google.oauth2.credentials import Credentials
12
+ from googleapiclient.discovery import build
13
+ except ImportError:
14
+ build = None
15
+
16
+
17
@register_component("fetcher")
class GoogleCalendarFetcher(BaseFetcher):
    """
    Fetches calendar events using the Google Calendar API with a user OAuth
    token. Works for Workspace (corporate) and personal accounts.
    """

    component_name = "GoogleCalendarFetcher"
    SUPPORTED_TYPES = ["google_calendar"]

    # Events are collected from this many days before now to this many after.
    DEFAULT_WINDOW_DAYS = 30

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gcal://`` URIs and 0.0 for anything else."""
        return 1.0 if uri.startswith("gcal://") else 0.0

    @staticmethod
    def _to_rfc3339(moment: datetime.datetime) -> str:
        """Format a UTC-aware datetime with a trailing ``Z`` designator."""
        return moment.isoformat().replace("+00:00", "Z")

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Return the events of one calendar around the current time.

        Reads ``token_path`` (required) and ``calendar_id`` (optional,
        defaults to ``"primary"``) from ``task.params``.

        Returns:
            ``{"content": [event dicts], "meta": {...}}`` where each event
            dict has id, summary, description, start, end, location,
            htmlLink and attendees.

        Raises:
            ImportError: the Google API client libraries are not installed.
            ValueError: ``token_path`` is missing or does not exist.
        """
        if not build:
            raise ImportError("Google API libraries required.")

        token_path = task.params.get("token_path")
        if not token_path or not os.path.exists(token_path):
            raise ValueError("Token path invalid.")

        creds = Credentials.from_authorized_user_file(token_path)
        service = build("calendar", "v3", credentials=creds)

        # Use the calendar the task names instead of always reading the
        # primary one.
        calendar_id = task.params.get("calendar_id") or "primary"

        # Timezone-aware "now"; datetime.utcnow() is deprecated and naive.
        now = datetime.datetime.now(datetime.timezone.utc)
        window = datetime.timedelta(days=self.DEFAULT_WINDOW_DAYS)
        time_min = self._to_rfc3339(now - window)
        time_max = self._to_rfc3339(now + window)

        # Follow nextPageToken so a busy calendar is not cut off after the
        # first page of results.
        events = []
        page_token = None
        while True:
            list_kwargs = {
                "calendarId": calendar_id,
                "timeMin": time_min,
                "timeMax": time_max,
                "singleEvents": True,
                "orderBy": "startTime",
            }
            if page_token:
                list_kwargs["pageToken"] = page_token

            page = service.events().list(**list_kwargs).execute()
            events.extend(page.get("items", []))

            page_token = page.get("nextPageToken")
            if not page_token:
                break

        parsed_events = []
        for event in events:
            # Timed events carry "dateTime"; all-day events carry "date".
            start_info = event.get("start", {})
            end_info = event.get("end", {})

            parsed_events.append(
                {
                    "id": event.get("id"),
                    "summary": event.get("summary", "No Title"),
                    "description": event.get("description", ""),
                    "start": start_info.get("dateTime", start_info.get("date")),
                    "end": end_info.get("dateTime", end_info.get("date")),
                    "location": event.get("location", ""),
                    "htmlLink": event.get("htmlLink", ""),
                    "attendees": [
                        {"email": a.get("email"), "status": a.get("responseStatus")}
                        for a in event.get("attendees", [])
                    ],
                }
            )

        return {
            "content": parsed_events,
            "meta": {
                "source": "google_calendar",
                "account": "authenticated_user",
                "count": len(parsed_events),
            },
        }
@@ -0,0 +1,46 @@
1
+ import os
2
+ from typing import Iterator
3
+
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouTask
6
+
7
+ from ..interfaces.base_generator import BaseGenerator
8
+
9
+
10
@register_component("generator")
class GoogleCalendarGenerator(BaseGenerator):
    """
    Generates a single calendar task backed by an OAuth 2.0 token.
    Requires 'sayou_google_token.json' (generated by auth script).
    """

    component_name = "GoogleCalendarGenerator"
    SUPPORTED_TYPES = ["google_calendar"]

    _URI_PREFIX = "gcal://"

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gcal://`` URIs and 0.0 for anything else."""
        return 1.0 if uri.startswith("gcal://") else 0.0

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Yield one task describing the calendar named by *source*.

        The calendar id is whatever follows ``gcal://``; an empty id means
        ``"primary"``.

        Keyword Args:
            google_token_path: path to the OAuth token file. ``token_path``
                is accepted as a fallback name.

        Raises:
            FileNotFoundError: no token path was given or the file is absent.
        """
        token_path = kwargs.get("google_token_path") or kwargs.get("token_path")

        # Check for a missing value first: os.path.exists(None) raises
        # TypeError instead of reporting the real problem.
        if not token_path or not os.path.exists(token_path):
            raise FileNotFoundError(
                f"Google Token not found at {token_path}. Run authentication script first."
            )

        # Strip only the leading scheme, never a later occurrence of it.
        if source.startswith(self._URI_PREFIX):
            calendar_id = source[len(self._URI_PREFIX):]
        else:
            calendar_id = source
        calendar_id = calendar_id or "primary"

        yield SayouTask(
            uri=source,
            source_type="google_calendar",
            params={
                "calendar_id": calendar_id,
                "token_path": token_path,
            },
            meta={
                "source": "google_calendar",
                "calendar_id": calendar_id,
                "filename": f"calendar_{calendar_id.replace('@', '_')}",
            },
        )
@@ -0,0 +1,113 @@
1
+ import html
2
+ import io
3
+ import re
4
+ from typing import Any, Dict
5
+
6
+ from sayou.core.registry import register_component
7
+ from sayou.core.schemas import SayouTask
8
+
9
+ from ..interfaces.base_fetcher import BaseFetcher
10
+
11
+ try:
12
+ from google.oauth2.credentials import Credentials
13
+ from googleapiclient.discovery import build
14
+ from googleapiclient.http import MediaIoBaseDownload
15
+ except ImportError:
16
+ build = None
17
+
18
+
19
@register_component("fetcher")
class GoogleDocsFetcher(BaseFetcher):
    """
    Fetches the content of a Google Docs document.

    Two modes, selected by ``task.params["fetch_mode"]``:
    - ``"html"`` (default): exports the document as HTML via the Drive API.
    - anything else: returns the raw document JSON from the Docs API (v1).
    """

    component_name = "GoogleDocsFetcher"
    SUPPORTED_TYPES = ["docs"]

    # Closing tags of block-level elements; a newline is added after each so
    # the exported HTML is not one enormous line.
    _BLOCK_CLOSE_RE = re.compile(
        r"(</(p|div|h[1-6]|li|ul|ol|table|tr|blockquote)>)",
        flags=re.IGNORECASE,
    )

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gdocs://document/`` URIs and 0.0 otherwise."""
        return 1.0 if uri.startswith("gdocs://document/") else 0.0

    def _do_fetch(self, task: SayouTask) -> Any:
        """
        Fetch the document named by ``task.uri`` in the requested mode.

        Returns:
            An HTML string in ``"html"`` mode, otherwise the document dict
            returned by the Docs API.

        Raises:
            ImportError: the Google API client libraries are not installed.
            ValueError: ``token_path`` is missing from ``task.params``.
        """
        if not build:
            raise ImportError("Google API libraries required.")

        token_path = task.params.get("token_path")
        if not token_path:
            raise ValueError("GoogleDocsFetcher requires 'token_path' in task params.")

        # The document id is the first path segment after the scheme prefix.
        doc_id = task.uri.replace("gdocs://document/", "").split("/")[0]
        original_title = task.meta.get("filename", "Untitled")
        fetch_mode = task.params.get("fetch_mode", "html")

        creds = Credentials.from_authorized_user_file(token_path)

        if fetch_mode == "html":
            return self._fetch_as_html(creds, doc_id, original_title)
        return self._fetch_as_json(creds, doc_id, original_title)

    # =========================================================
    # Mode 1: JSON (Docs API)
    # =========================================================
    def _fetch_as_json(self, creds, doc_id, title) -> Dict[str, Any]:
        """Return the document resource from the Docs API as a dict.

        *title* is accepted for signature parity with ``_fetch_as_html`` and
        is not used.
        """
        try:
            service = build("docs", "v1", credentials=creds)
            document = service.documents().get(documentId=doc_id).execute()
        except Exception as e:
            self._log(f"Docs API Failed: {e}", level="error")
            raise

        tab_count = len(document.get("tabs", []))
        if tab_count > 0:
            self._log(f"ℹ️ Fetched JSON with {tab_count} tabs.")

        return document

    # =========================================================
    # Mode 2: HTML (Drive API Export)
    # =========================================================
    def _fetch_as_html(self, creds, doc_id, title) -> str:
        """Export the document as HTML through the Drive API.

        *title* is accepted for signature parity with ``_fetch_as_json`` and
        is not used.
        """
        try:
            service = build("drive", "v3", credentials=creds)
            request = service.files().export_media(fileId=doc_id, mimeType="text/html")

            buffer = io.BytesIO()
            downloader = MediaIoBaseDownload(buffer, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()
        except Exception as e:
            self._log(f"HTML Export Failed: {e}", level="error")
            raise

        html_str = buffer.getvalue().decode("utf-8", errors="replace")

        # NOTE(review): unescaping the whole document also turns escaped
        # markup characters in the text (e.g. "&lt;") back into raw ones.
        # Presumably done so entity-encoded text reads naturally; confirm
        # that downstream parsing tolerates it.
        decoded_html = html.unescape(html_str)

        return self._BLOCK_CLOSE_RE.sub(r"\1\n", decoded_html)
@@ -0,0 +1,151 @@
1
+ import io
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ from sayou.core.registry import register_component
6
+ from sayou.core.schemas import SayouTask
7
+
8
+ from ..interfaces.base_fetcher import BaseFetcher
9
+
10
+ try:
11
+ import chardet
12
+ from google.oauth2.credentials import Credentials
13
+ from googleapiclient.discovery import build
14
+ from googleapiclient.errors import HttpError
15
+ from googleapiclient.http import MediaIoBaseDownload
16
+ except ImportError:
17
+ build = None
18
+ chardet = None
19
+
20
+
21
@register_component("fetcher")
class GoogleDriveFetcher(BaseFetcher):
    """
    Fetches content from Google Drive files.
    - Google Native Formats -> Converted to MS Office formats (.docx, .xlsx, .pptx)
    - Standard Files (PDF, JPG, ZIP...) -> Downloaded as original binary.
    """

    component_name = "GoogleDriveFetcher"
    SUPPORTED_TYPES = ["drive"]

    _URI_PREFIX = "gdrive://file/"

    # Google-native MIME type -> (export MIME type, file extension).
    _EXPORT_FORMATS = {
        "application/vnd.google-apps.document": (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".docx",
        ),
        "application/vnd.google-apps.spreadsheet": (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".xlsx",
        ),
        "application/vnd.google-apps.presentation": (
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".pptx",
        ),
    }

    # Extensions treated as text and therefore eligible for transcoding.
    _TEXT_EXTENSIONS = frozenset(
        {".csv", ".txt", ".json", ".md", ".py", ".html", ".xml"}
    )

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gdrive://file/`` URIs and 0.0 otherwise."""
        return 1.0 if uri.startswith("gdrive://file/") else 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Download one Drive file and return its bytes plus metadata.

        Reads ``token_path``, ``file_id`` and ``mime_type`` from
        ``task.params`` and ``filename`` from ``task.meta``. When ``file_id``
        is absent it is taken from the ``gdrive://file/<id>`` URI.

        Returns:
            ``{"content": bytes, "meta": {...}}``. On a Drive HTTP error the
            content is ``b""`` and ``meta`` carries an ``"error"`` string.

        Raises:
            ImportError: the Google API client libraries are not installed.
            ValueError: ``token_path`` or the file id is missing.
        """
        if not build:
            raise ImportError("Google API libraries required.")

        token_path = task.params.get("token_path")
        if not token_path:
            raise ValueError("GoogleDriveFetcher requires 'token_path' in task params.")

        file_id = task.params.get("file_id")
        if not file_id and task.uri.startswith(self._URI_PREFIX):
            file_id = task.uri[len(self._URI_PREFIX):].split("/")[0]
        if not file_id:
            raise ValueError("GoogleDriveFetcher requires 'file_id' in task params.")

        raw_mime_type = task.params.get("mime_type")
        # Normalized copy so the string checks below cannot fail on None.
        mime_type = raw_mime_type or ""
        original_name = task.meta.get("filename", "unknown_file")

        creds = Credentials.from_authorized_user_file(token_path)
        service = build("drive", "v3", credentials=creds)

        # 1. Google native formats are exported; everything else is
        #    downloaded as stored.
        export_format = self._EXPORT_FORMATS.get(mime_type)
        is_google_doc = export_format is not None
        if export_format is not None:
            export_mime, extension = export_format
            request = service.files().export_media(
                fileId=file_id, mimeType=export_mime
            )
        else:
            request = service.files().get_media(fileId=file_id)
            extension = os.path.splitext(original_name)[1]

        # 2. Execute Download
        try:
            buffer = io.BytesIO()
            downloader = MediaIoBaseDownload(buffer, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()
        except HttpError as e:
            self._log(f"Drive Download Failed ({file_id}): {e}", level="error")
            return {
                "content": b"",
                "meta": {"source": "google_drive", "error": str(e), "file_id": file_id},
            }

        raw_bytes = buffer.getvalue()
        final_content = raw_bytes

        is_text_candidate = (
            mime_type.startswith("text/")
            or mime_type == "application/json"
            or extension.lower() in self._TEXT_EXTENSIONS
        )

        if chardet is not None and not is_google_doc and is_text_candidate:
            final_content = self._transcode_to_utf8(raw_bytes, original_name)

        # 3. Return
        has_extension = original_name.lower().endswith(extension.lower())
        return {
            "content": final_content,
            "meta": {
                "source": "google_drive",
                "file_id": file_id,
                "mime_type": raw_mime_type,
                "original_filename": original_name,
                "suggested_filename": (
                    original_name if has_extension else f"{original_name}{extension}"
                ),
                "extension": extension,
                "is_binary": isinstance(final_content, bytes),
            },
        }

    def _transcode_to_utf8(self, raw_bytes: bytes, original_name: str) -> bytes:
        """Re-encode legacy-encoded text (e.g. EUC-KR) as UTF-8 bytes.

        Returns *raw_bytes* unchanged when the detected encoding is already
        UTF-8/ASCII, detection confidence is 0.6 or lower, or conversion fails.
        """
        detected = chardet.detect(raw_bytes)
        encoding = detected.get("encoding")
        confidence = detected.get("confidence") or 0

        if not encoding or encoding.lower() in ("utf-8", "ascii") or confidence <= 0.6:
            return raw_bytes

        try:
            transcoded = raw_bytes.decode(encoding).encode("utf-8")
        except (LookupError, UnicodeError) as e:
            # LookupError: codec unknown to Python; UnicodeError: bad bytes.
            self._log(
                f"Encoding conversion failed: {e}. Keeping raw bytes.",
                level="warning",
            )
            return raw_bytes

        self._log(f"Transcoded {original_name} from {encoding} to utf-8 bytes.")
        return transcoded