sayou-connector 0.3.7__tar.gz → 0.3.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/PKG-INFO +1 -1
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/pyproject.toml +1 -1
- sayou_connector-0.3.16/src/sayou/connector/plugins/gmail_fetcher.py +128 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/gmail_generator.py +81 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/google_calendar_fetcher.py +89 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/google_calendar_generator.py +46 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/google_docs_fetcher.py +113 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/google_drive_fetcher.py +151 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/google_drive_generator.py +107 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/google_sheets_fetcher.py +85 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/google_slides_fetcher.py +99 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/google_youtube_fetcher.py +60 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/google_youtube_generator.py +86 -0
- sayou_connector-0.3.7/src/sayou/connector/plugins/gmail_fetcher.py → sayou_connector-0.3.16/src/sayou/connector/plugins/imap_email_fetcher.py +23 -10
- sayou_connector-0.3.7/src/sayou/connector/plugins/gmail_generator.py → sayou_connector-0.3.16/src/sayou/connector/plugins/imap_email_generator.py +24 -12
- sayou_connector-0.3.16/src/sayou/connector/plugins/public_youtube_fetcher.py +134 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/public_youtube_generator.py +60 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/trafilatura_fetcher.py +51 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/trafilatura_generator.py +32 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/wikipedia_fetcher.py +41 -0
- sayou_connector-0.3.16/src/sayou/connector/plugins/wikipedia_generator.py +43 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/.gitignore +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/README.md +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/examples/quick_start.ipynb +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/examples/quick_start.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/__init__.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/core/exceptions.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/fetcher/file_fetcher.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/fetcher/requests_fetcher.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/fetcher/sqlite_fetcher.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/generator/file_generator.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/generator/requests_generator.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/generator/sqlite_generator.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/interfaces/base_fetcher.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/interfaces/base_generator.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/pipeline.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/plugins/notion_fetcher.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/src/sayou/connector/plugins/notion_generator.py +0 -0
- {sayou_connector-0.3.7 → sayou_connector-0.3.16}/tests/test_connector.py +0 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
from typing import Any, Dict
|
|
3
|
+
|
|
4
|
+
from sayou.core.registry import register_component
|
|
5
|
+
from sayou.core.schemas import SayouTask
|
|
6
|
+
|
|
7
|
+
from ..interfaces.base_fetcher import BaseFetcher
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from google.oauth2.credentials import Credentials
|
|
11
|
+
from googleapiclient.discovery import build
|
|
12
|
+
except ImportError:
|
|
13
|
+
build = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@register_component("fetcher")
class GmailFetcher(BaseFetcher):
    """
    Fetches one email through the Gmail API and rebuilds it as an HTML page.

    The message is requested with ``format="full"``. Its Subject/From/Date
    headers are written into ``<title>``/``<meta>`` tags and the best body
    part found in the MIME tree is placed inside ``<body>``.
    """

    component_name = "GmailFetcher"
    SUPPORTED_TYPES = ["gmail"]

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 when *uri* uses the ``gmail-msg://`` scheme, else 0.0."""
        return 1.0 if uri.startswith("gmail-msg://") else 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Download the message named by ``task.params["msg_id"]`` and render it.

        Reads ``token_path`` (authorized-user token file) and ``msg_id`` from
        ``task.params``.

        Returns:
            The reconstructed HTML document as a string.
            NOTE(review): the annotation says ``Dict[str, Any]`` but a string
            has always been returned here; the annotation is left untouched
            so the signature keeps matching the base class.

        Raises:
            ImportError: the Google API client libraries are not installed.
            ValueError: ``token_path`` or ``msg_id`` is missing.
        """
        if not build:
            raise ImportError("Please install google-api-python-client")

        token_path = task.params.get("token_path")
        msg_id = task.params.get("msg_id")
        if not token_path:
            raise ValueError("GmailFetcher requires 'token_path' in task params.")
        if not msg_id:
            raise ValueError("GmailFetcher requires 'msg_id' in task params.")

        creds = Credentials.from_authorized_user_file(token_path)
        service = build("gmail", "v1", credentials=creds)

        # 1. Fetch email details (format='full')
        message = (
            service.users()
            .messages()
            .get(userId="me", id=msg_id, format="full")
            .execute()
        )

        payload = message.get("payload", {})
        headers = payload.get("headers", [])

        # 2. Parse headers (Subject, From, Date)
        subject = self._get_header(headers, "Subject", "(No Subject)")
        sender = self._get_header(headers, "From", "Unknown")
        date = self._get_header(headers, "Date", "")

        # 3. Extract body (recursive walk over the MIME parts)
        body_content = self._extract_body(payload)

        # 4. Reconstruct HTML. Header values are escaped because they are
        #    free text: a From header such as '"Jane Doe" <jane@x.org>'
        #    would otherwise terminate the attribute and corrupt the markup.
        #    The body is inserted as-is: it is either the sender's own HTML
        #    or already-escaped plain text wrapped in <pre>.
        esc = self._escape
        html_doc = f"""<!DOCTYPE html>
<html>
<head>
    <title>{esc(subject)}</title>
    <meta name="sender" content="{esc(sender)}">
    <meta name="date" content="{esc(date)}">
    <meta name="msg_id" content="{esc(msg_id)}">
    <meta name="source" content="gmail">
</head>
<body>
    {body_content}
</body>
</html>"""

        return html_doc.strip()

    @staticmethod
    def _escape(text: Any) -> str:
        """HTML-escape *text* (quotes included) for safe interpolation."""
        import html  # function-scope: this module does not import it globally

        return html.escape(str(text), quote=True)

    def _get_header(self, headers: list, name: str, default: str) -> str:
        """
        Return the value of the first header called *name* (case-insensitive).

        *headers* is a list of ``{"name": ..., "value": ...}`` dicts; *default*
        is returned when no entry matches.
        """
        wanted = name.lower()
        for header in headers:
            if header.get("name", "").lower() == wanted:
                return header.get("value", default)
        return default

    def _part_charset(self, part: dict) -> str:
        """Return the charset declared in the part's Content-Type, or utf-8."""
        from email.message import Message  # function-scope stdlib import

        content_type = self._get_header(
            part.get("headers") or [], "Content-Type", ""
        )
        if not content_type:
            return "utf-8"
        probe = Message()
        probe["Content-Type"] = content_type
        return probe.get_content_charset() or "utf-8"

    def _extract_body(self, payload: dict) -> str:
        """
        Return the best HTML rendering of *payload*, or ``""`` if none exists.

        A part carrying inline ``body.data`` is decoded directly: ``text/html``
        is returned unchanged and ``text/plain`` is escaped and wrapped in
        ``<pre>``. Otherwise the child parts are walked recursively and the
        first non-empty candidate is chosen in this order of preference:
        direct ``text/html`` child, nested ``multipart/*`` child, direct
        ``text/plain`` child.
        """
        # Case A: Single Part
        inline_data = (payload.get("body") or {}).get("data")
        if inline_data:
            mime_type = payload.get("mimeType", "")
            decoded_text = self._decode_base64url(
                inline_data, self._part_charset(payload)
            )
            if mime_type == "text/html":
                return decoded_text
            if mime_type == "text/plain":
                # Escape so that '<', '>' and '&' in the text stay literal.
                return f"<pre>{self._escape(decoded_text)}</pre>"

        # Case B: Multi Part
        html_part = ""
        nested_part = ""
        text_part = ""
        for part in payload.get("parts") or []:
            # A non-empty filename marks an attachment; it must not replace
            # the message body even when its content is inline text.
            if part.get("filename"):
                continue

            content = self._extract_body(part)
            if not content:
                # Empty parts must never overwrite a body already found.
                continue

            mime_type = part.get("mimeType", "")
            if mime_type == "text/html":
                html_part = html_part or content
            elif mime_type == "text/plain":
                text_part = text_part or content
            elif "multipart" in mime_type:
                nested_part = nested_part or content

        return html_part or nested_part or text_part

    def _decode_base64url(self, data: str, charset: str = "utf-8") -> str:
        """
        Decode URL-safe base64 *data* into text.

        Missing ``=`` padding is restored before decoding. Bytes are decoded
        with *charset* (undecodable bytes are replaced); an unknown charset
        name falls back to UTF-8. Malformed base64 yields ``""`` so one broken
        part does not abort the whole fetch.
        """
        try:
            padded = data + "=" * (-len(data) % 4)
            raw = base64.urlsafe_b64decode(padded)
        except (ValueError, TypeError):
            # binascii.Error is a ValueError; TypeError covers non-str input.
            return ""

        try:
            return raw.decode(charset, errors="replace")
        except LookupError:
            return raw.decode("utf-8", errors="replace")
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from typing import Iterator
|
|
2
|
+
|
|
3
|
+
from sayou.core.registry import register_component
|
|
4
|
+
from sayou.core.schemas import SayouTask
|
|
5
|
+
|
|
6
|
+
from ..interfaces.base_generator import BaseGenerator
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from google.oauth2.credentials import Credentials
|
|
10
|
+
from googleapiclient.discovery import build
|
|
11
|
+
except ImportError:
|
|
12
|
+
build = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@register_component("generator")
class GmailGenerator(BaseGenerator):
    """
    Scans a Gmail mailbox using the Gmail API (OAuth) and generates tasks.

    One ``SayouTask`` with a ``gmail-msg://<id>`` URI is produced per message
    returned by the search.
    """

    component_name = "GmailGenerator"
    SUPPORTED_TYPES = ["gmail"]

    @classmethod
    def can_handle(cls, source: str) -> float:
        """Return 1.0 when *source* uses the ``gmail://`` scheme, else 0.0."""
        return 1.0 if source.startswith("gmail://") else 0.0

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Connects to Gmail API -> Search (List) -> Yield Tasks.

        source example: gmail://me (default) or gmail://me?q=is:unread

        Keyword Args:
            token_path: path to the authorized-user token file (required).
            query: Gmail search expression. When omitted, the ``q`` parameter
                of *source* is used, and ``"is:inbox"`` if that is absent too.
            limit: ``maxResults`` sent to the API (default 10). Only the
                first result page is read.

        Raises:
            ImportError: the Google API client libraries are not installed.
            ValueError: ``token_path`` was not supplied.
        """
        from urllib.parse import parse_qs, urlsplit  # function-scope stdlib import

        if not build:
            raise ImportError(
                "Please install google-api-python-client google-auth-oauthlib"
            )

        token_path = kwargs.get("token_path")
        if not token_path:
            raise ValueError("GmailGenerator requires 'token_path' in kwargs.")

        # 1. Parsing Parameters
        if "query" in kwargs:
            # An explicit keyword always wins, exactly as before.
            query = kwargs["query"]
        else:
            # Honour the documented gmail://me?q=... form.
            from_uri = parse_qs(urlsplit(source).query).get("q")
            query = from_uri[0] if from_uri else "is:inbox"
        max_results = int(kwargs.get("limit", 10))

        # 2. Connect to Gmail API
        creds = Credentials.from_authorized_user_file(token_path)
        service = build("gmail", "v1", credentials=creds)

        # 3. Fetch email list. Only the API call sits inside the try so that
        #    unrelated failures are not reported as a list failure.
        try:
            results = (
                service.users()
                .messages()
                .list(userId="me", q=query, maxResults=max_results)
                .execute()
            )
        except Exception as e:
            self._log(f"Gmail API List failed: {e}", level="error")
            raise  # bare raise keeps the original traceback

        messages = results.get("messages", [])

        self._log(f"📧 Found {len(messages)} emails. Generating tasks...")

        # 4. Generate tasks
        for msg in messages:
            msg_id = msg["id"]
            yield SayouTask(
                uri=f"gmail-msg://{msg_id}",
                source_type="gmail",
                params={
                    "token_path": token_path,
                    "msg_id": msg_id,
                    "thread_id": msg.get("threadId"),
                },
            )
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import os
|
|
3
|
+
from typing import Any, Dict
|
|
4
|
+
|
|
5
|
+
from sayou.core.registry import register_component
|
|
6
|
+
from sayou.core.schemas import SayouTask
|
|
7
|
+
|
|
8
|
+
from ..interfaces.base_fetcher import BaseFetcher
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
from google.oauth2.credentials import Credentials
|
|
12
|
+
from googleapiclient.discovery import build
|
|
13
|
+
except ImportError:
|
|
14
|
+
build = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@register_component("fetcher")
class GoogleCalendarFetcher(BaseFetcher):
    """
    Fetches events using Google API with User OAuth Token.
    Works for Workspace (Corporate) & Personal accounts.

    Events from 30 days before to 30 days after the current UTC time are
    returned, with recurring events expanded into single instances.
    """

    component_name = "GoogleCalendarFetcher"
    SUPPORTED_TYPES = ["google_calendar"]

    # Half-width of the time window around "now", in days.
    _WINDOW_DAYS = 30

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 when *uri* uses the ``gcal://`` scheme, else 0.0."""
        return 1.0 if uri.startswith("gcal://") else 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        List the events of one calendar and return them in simplified form.

        Reads ``token_path`` and, optionally, ``calendar_id`` from
        ``task.params``; the calendar defaults to ``"primary"``.

        Returns:
            ``{"content": [event, ...], "meta": {...}}`` where each event has
            the keys id, summary, description, start, end, location,
            htmlLink and attendees.

        Raises:
            ImportError: the Google API client libraries are not installed.
            ValueError: ``token_path`` is missing or does not exist.
        """
        if not build:
            raise ImportError("Google API libraries required.")

        token_path = task.params.get("token_path")
        if not token_path or not os.path.exists(token_path):
            raise ValueError("Token path invalid.")

        creds = Credentials.from_authorized_user_file(token_path)
        service = build("calendar", "v3", credentials=creds)

        # GoogleCalendarGenerator passes the calendar in params; it used to
        # be ignored here, so every task silently read the primary calendar.
        calendar_id = task.params.get("calendar_id") or "primary"

        # Timezone-aware replacement for the deprecated utcnow(); the
        # "+00:00" suffix is rewritten to "Z" to keep the previous format.
        now = datetime.datetime.now(datetime.timezone.utc)
        window = datetime.timedelta(days=self._WINDOW_DAYS)
        time_min = (now - window).isoformat().replace("+00:00", "Z")
        time_max = (now + window).isoformat().replace("+00:00", "Z")

        events_api = service.events()
        request = events_api.list(
            calendarId=calendar_id,
            timeMin=time_min,
            timeMax=time_max,
            singleEvents=True,
            orderBy="startTime",
        )

        # Follow nextPageToken so busy calendars are not truncated to the
        # first page; list_next() returns None after the last page.
        events = []
        while request is not None:
            page = request.execute()
            events.extend(page.get("items", []))
            request = events_api.list_next(request, page)

        parsed_events = [self._simplify(event) for event in events]

        return {
            "content": parsed_events,
            "meta": {
                "source": "google_calendar",
                "account": "authenticated_user",
                "calendar_id": calendar_id,
                "count": len(parsed_events),
            },
        }

    @staticmethod
    def _simplify(event: Dict[str, Any]) -> Dict[str, Any]:
        """Reduce a raw API event to the fields exposed by this fetcher."""
        start_info = event.get("start") or {}
        end_info = event.get("end") or {}
        return {
            "id": event.get("id"),
            "summary": event.get("summary", "No Title"),
            "description": event.get("description", ""),
            # Prefer "dateTime" and fall back to "date" when it is absent.
            "start": start_info.get("dateTime", start_info.get("date")),
            "end": end_info.get("dateTime", end_info.get("date")),
            "location": event.get("location", ""),
            "htmlLink": event.get("htmlLink", ""),
            "attendees": [
                {"email": a.get("email"), "status": a.get("responseStatus")}
                for a in event.get("attendees", [])
            ],
        }
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Iterator
|
|
3
|
+
|
|
4
|
+
from sayou.core.registry import register_component
|
|
5
|
+
from sayou.core.schemas import SayouTask
|
|
6
|
+
|
|
7
|
+
from ..interfaces.base_generator import BaseGenerator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_component("generator")
class GoogleCalendarGenerator(BaseGenerator):
    """
    Generates tasks using OAuth 2.0 Token.
    Requires 'sayou_google_token.json' (generated by auth script).

    Exactly one task is produced per source; the calendar id is whatever
    follows ``gcal://`` in the source, or ``"primary"`` when nothing follows.
    """

    component_name = "GoogleCalendarGenerator"
    SUPPORTED_TYPES = ["google_calendar"]

    _URI_PREFIX = "gcal://"

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 when *uri* uses the ``gcal://`` scheme, else 0.0."""
        return 1.0 if uri.startswith("gcal://") else 0.0

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Yield a single calendar task for *source*.

        Keyword Args:
            google_token_path: path to the OAuth token file. ``token_path``
                is accepted as an alias, matching the Gmail generator.

        Raises:
            ValueError: no token path was supplied.
            FileNotFoundError: the token file does not exist.
        """
        token_path = kwargs.get("google_token_path") or kwargs.get("token_path")

        # Fail with a clear message instead of the TypeError that
        # os.path.exists(None) would raise.
        if not token_path:
            raise ValueError(
                "GoogleCalendarGenerator requires 'google_token_path' in kwargs."
            )

        if not os.path.exists(token_path):
            raise FileNotFoundError(
                f"Google Token not found at {token_path}. Run authentication script first."
            )

        # Strip only the leading scheme; str.replace would also remove any
        # later occurrence of "gcal://" inside the identifier.
        prefix = self._URI_PREFIX
        remainder = source[len(prefix):] if source.startswith(prefix) else source
        calendar_id = remainder or "primary"

        yield SayouTask(
            uri=source,
            source_type="google_calendar",
            params={
                "calendar_id": calendar_id,
                "token_path": token_path,
            },
            meta={
                "source": "google_calendar",
                "calendar_id": calendar_id,
                "filename": f"calendar_{calendar_id.replace('@', '_')}",
            },
        )
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import html
|
|
2
|
+
import io
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any, Dict
|
|
5
|
+
|
|
6
|
+
from sayou.core.registry import register_component
|
|
7
|
+
from sayou.core.schemas import SayouTask
|
|
8
|
+
|
|
9
|
+
from ..interfaces.base_fetcher import BaseFetcher
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from google.oauth2.credentials import Credentials
|
|
13
|
+
from googleapiclient.discovery import build
|
|
14
|
+
from googleapiclient.http import MediaIoBaseDownload
|
|
15
|
+
except ImportError:
|
|
16
|
+
build = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@register_component("fetcher")
class GoogleDocsFetcher(BaseFetcher):
    """
    Fetches the content of a Google Docs document.

    Two modes are selected through ``task.params["fetch_mode"]``:

    - ``"html"`` (default): the document is exported as HTML through the
      Drive API (v3) and returned as a string.
    - anything else: the raw document resource is fetched through the
      Docs API (v1) and returned as a dict.
    """

    component_name = "GoogleDocsFetcher"
    SUPPORTED_TYPES = ["docs"]

    # Closing tags of block-level elements; a newline is added after each
    # one so the exported single-line HTML becomes readable.
    _BLOCK_CLOSE_RE = re.compile(
        r"(</(p|div|h[1-6]|li|ul|ol|table|tr|blockquote)>)",
        flags=re.IGNORECASE,
    )

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gdocs://document/`` URIs and 0.0 otherwise."""
        return 1.0 if uri.startswith("gdocs://document/") else 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Fetch the document identified by ``task.uri`` in the requested mode.

        The document id is the first path segment after
        ``gdocs://document/``. ``token_path`` is read from ``task.params``.

        Returns:
            The HTML string (html mode) or the Docs API document dict.
            NOTE(review): html mode returns ``str`` although the annotation
            says ``Dict[str, Any]``; the annotation is left unchanged.

        Raises:
            ImportError: the Google API client libraries are not installed.
            ValueError: ``token_path`` was not supplied.
        """
        # Without this guard a missing dependency surfaced as a NameError
        # on ``Credentials`` a few lines below.
        if not build:
            raise ImportError("Please install google-api-python-client")

        token_path = task.params.get("token_path")
        if not token_path:
            raise ValueError("GoogleDocsFetcher requires 'token_path' in task params.")

        doc_id = task.uri.replace("gdocs://document/", "").split("/")[0]
        original_title = task.meta.get("filename", "Untitled")

        fetch_mode = task.params.get("fetch_mode", "html")

        creds = Credentials.from_authorized_user_file(token_path)

        if fetch_mode == "html":
            return self._fetch_as_html(creds, doc_id, original_title)
        return self._fetch_as_json(creds, doc_id, original_title)

    # =========================================================
    # Mode 1: JSON (Docs API)
    # =========================================================
    def _fetch_as_json(self, creds, doc_id, title) -> Dict[str, Any]:
        """
        Return the Docs API document resource for *doc_id*.

        *title* is accepted for signature symmetry with ``_fetch_as_html``
        and is currently unused. Failures are logged and re-raised.
        """
        try:
            service = build("docs", "v1", credentials=creds)
            document = service.documents().get(documentId=doc_id).execute()
        except Exception as e:
            self._log(f"Docs API Failed: {e}", level="error")
            raise  # bare raise keeps the original traceback

        tab_count = len(document.get("tabs", []))
        if tab_count > 0:
            self._log(f"ℹ️ Fetched JSON with {tab_count} tabs.")

        return document

    # =========================================================
    # Mode 2: HTML (Drive API Export)
    # =========================================================
    def _fetch_as_html(self, creds, doc_id, title) -> Dict[str, Any]:
        """
        Export *doc_id* as ``text/html`` through the Drive API.

        The bytes are decoded as UTF-8 (invalid bytes replaced), HTML
        entities are unescaped, and a newline is inserted after each
        block-level closing tag. *title* is currently unused. Failures are
        logged and re-raised.
        """
        try:
            service = build("drive", "v3", credentials=creds)
            request = service.files().export_media(fileId=doc_id, mimeType="text/html")

            buffer = io.BytesIO()
            downloader = MediaIoBaseDownload(buffer, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()
        except Exception as e:
            self._log(f"HTML Export Failed: {e}", level="error")
            raise  # bare raise keeps the original traceback

        html_str = buffer.getvalue().decode("utf-8", errors="replace")

        # NOTE(review): unescaping the entire document also turns "&lt;" and
        # "&amp;" in the text into live markup characters. Kept because
        # downstream consumers may rely on it — confirm before changing.
        decoded_html = html.unescape(html_str)

        return self._BLOCK_CLOSE_RE.sub(r"\1\n", decoded_html)
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import os
|
|
3
|
+
from typing import Any, Dict
|
|
4
|
+
|
|
5
|
+
from sayou.core.registry import register_component
|
|
6
|
+
from sayou.core.schemas import SayouTask
|
|
7
|
+
|
|
8
|
+
from ..interfaces.base_fetcher import BaseFetcher
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import chardet
|
|
12
|
+
from google.oauth2.credentials import Credentials
|
|
13
|
+
from googleapiclient.discovery import build
|
|
14
|
+
from googleapiclient.errors import HttpError
|
|
15
|
+
from googleapiclient.http import MediaIoBaseDownload
|
|
16
|
+
except ImportError:
|
|
17
|
+
build = None
|
|
18
|
+
chardet = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@register_component("fetcher")
class GoogleDriveFetcher(BaseFetcher):
    """
    Fetches content from Google Drive files.
    - Google Native Formats -> Converted to MS Office formats (.docx, .xlsx, .pptx)
    - Standard Files (PDF, JPG, ZIP...) -> Downloaded as original binary.

    Downloaded text files whose detected encoding is neither UTF-8 nor ASCII
    are re-encoded to UTF-8 bytes.
    """

    component_name = "GoogleDriveFetcher"
    SUPPORTED_TYPES = ["drive"]

    # Google-native MIME type -> (export MIME type, file extension).
    _EXPORT_FORMATS = {
        "application/vnd.google-apps.document": (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".docx",
        ),
        "application/vnd.google-apps.spreadsheet": (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".xlsx",
        ),
        "application/vnd.google-apps.presentation": (
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".pptx",
        ),
    }

    # Extensions treated as text even when the MIME type does not say so.
    _TEXT_EXTENSIONS = frozenset(
        {".csv", ".txt", ".json", ".md", ".py", ".html", ".xml"}
    )

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for ``gdrive://file/`` URIs and 0.0 otherwise."""
        return 1.0 if uri.startswith("gdrive://file/") else 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Download or export one Drive file.

        Reads ``token_path``, ``file_id`` and ``mime_type`` from
        ``task.params`` and ``filename`` from ``task.meta``.

        Returns:
            ``{"content": bytes, "meta": {...}}``. When the download fails
            with ``HttpError`` the content is ``b""`` and the meta carries an
            ``"error"`` entry instead of raising.

        Raises:
            ImportError: the Google API client libraries are not installed.
        """
        # Without this guard a missing dependency surfaced as a NameError
        # on ``Credentials`` a few lines below.
        if not build:
            raise ImportError("Please install google-api-python-client chardet")

        token_path = task.params.get("token_path")
        file_id = task.params.get("file_id")
        mime_type = task.params.get("mime_type")
        original_name = task.meta.get("filename", "unknown_file")

        creds = Credentials.from_authorized_user_file(token_path)
        service = build("drive", "v3", credentials=creds)

        # 1. Google native formats are exported; everything else is
        #    downloaded byte-for-byte.
        export = self._EXPORT_FORMATS.get(mime_type)
        is_google_doc = export is not None
        if is_google_doc:
            export_mime, extension = export
            request = service.files().export_media(
                fileId=file_id, mimeType=export_mime
            )
        else:
            request = service.files().get_media(fileId=file_id)
            extension = os.path.splitext(original_name)[1]

        # 2. Execute Download
        try:
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()
        except HttpError as e:
            self._log(f"Drive Download Failed ({file_id}): {e}", level="error")
            return {
                "content": b"",
                "meta": {"source": "google_drive", "error": str(e), "file_id": file_id},
            }

        raw_bytes = fh.getvalue()
        final_content = raw_bytes

        if not is_google_doc and self._is_text_candidate(mime_type, extension):
            final_content = self._to_utf8(raw_bytes, original_name)

        # Compare case-insensitively so "REPORT.DOCX" does not become
        # "REPORT.DOCX.docx"; an empty extension always keeps the name.
        if original_name.lower().endswith(extension.lower()):
            suggested_filename = original_name
        else:
            suggested_filename = f"{original_name}{extension}"

        # 3. Return
        return {
            "content": final_content,
            "meta": {
                "source": "google_drive",
                "file_id": file_id,
                "mime_type": mime_type,
                "original_filename": original_name,
                "suggested_filename": suggested_filename,
                "extension": extension,
                "is_binary": isinstance(final_content, bytes),
            },
        }

    @classmethod
    def _is_text_candidate(cls, mime_type, extension: str) -> bool:
        """Tell whether the MIME type or extension marks the file as text."""
        # mime_type comes from params.get() and may be None; calling
        # .startswith on it used to raise AttributeError.
        mime = mime_type or ""
        return (
            mime.startswith("text/")
            or mime == "application/json"
            or extension.lower() in cls._TEXT_EXTENSIONS
        )

    def _to_utf8(self, raw_bytes: bytes, original_name: str) -> bytes:
        """
        Re-encode *raw_bytes* to UTF-8 when a different encoding is detected.

        The input is returned unchanged when ``chardet`` is unavailable, the
        detected encoding is already UTF-8/ASCII, the detection confidence is
        0.6 or lower, or the conversion fails.
        """
        if chardet is None:
            return raw_bytes

        detected = chardet.detect(raw_bytes)
        encoding = detected.get("encoding")
        confidence = detected.get("confidence") or 0

        # e.g. EUC-KR -> UTF-8
        if (
            not encoding
            or encoding.lower() in ("utf-8", "ascii")
            or confidence <= 0.6
        ):
            return raw_bytes

        try:
            # Decode (Bytes -> Str), then encode back (Str -> UTF-8 Bytes)
            converted = raw_bytes.decode(encoding).encode("utf-8")
        except (LookupError, UnicodeError) as e:
            self._log(
                f"Encoding conversion failed: {e}. Keeping raw bytes.",
                level="warning",
            )
            return raw_bytes

        self._log(f"Transcoded {original_name} from {encoding} to utf-8 bytes.")
        return converted
|