sayou-connector 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sayou/connector/__init__.py +11 -0
- sayou/connector/core/exceptions.py +38 -0
- sayou/connector/fetcher/file_fetcher.py +42 -0
- sayou/connector/fetcher/requests_fetcher.py +77 -0
- sayou/connector/fetcher/sqlite_fetcher.py +50 -0
- sayou/connector/generator/file_generator.py +124 -0
- sayou/connector/generator/requests_generator.py +113 -0
- sayou/connector/generator/sqlite_generator.py +140 -0
- sayou/connector/interfaces/base_fetcher.py +81 -0
- sayou/connector/interfaces/base_generator.py +99 -0
- sayou/connector/pipeline.py +304 -0
- sayou/connector/plugins/gmail_fetcher.py +127 -0
- sayou/connector/plugins/gmail_generator.py +79 -0
- sayou/connector/plugins/google_calendar_fetcher.py +89 -0
- sayou/connector/plugins/google_calendar_generator.py +46 -0
- sayou/connector/plugins/google_drive_fetcher.py +151 -0
- sayou/connector/plugins/google_drive_generator.py +107 -0
- sayou/connector/plugins/imap_email_fetcher.py +140 -0
- sayou/connector/plugins/imap_email_generator.py +93 -0
- sayou/connector/plugins/notion_fetcher.py +301 -0
- sayou/connector/plugins/notion_generator.py +73 -0
- sayou/connector/plugins/public_youtube_fetcher.py +134 -0
- sayou/connector/plugins/public_youtube_generator.py +60 -0
- sayou_connector-0.3.12.dist-info/METADATA +303 -0
- sayou_connector-0.3.12.dist-info/RECORD +26 -0
- sayou_connector-0.3.12.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
from sayou.core.registry import register_component
|
|
6
|
+
from sayou.core.schemas import SayouTask
|
|
7
|
+
|
|
8
|
+
from ..interfaces.base_fetcher import BaseFetcher
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@register_component("fetcher")
class NotionFetcher(BaseFetcher):
    """
    Fetches Notion content via the official Notion REST API.

    Supports:
    1. Pages (Text, Media, Simple Tables)
    2. Inline Databases (Databases embedded inside pages)
    3. Full Databases (URL with ?v=...)
    """

    component_name = "NotionFetcher"
    SUPPORTED_TYPES = ["notion"]

    # Notion resource IDs are UUIDs; the URL form may omit the dashes.
    UUID_PATTERN = re.compile(
        r"[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}",
        re.IGNORECASE,
    )

    # Seconds before an API call is aborted instead of hanging indefinitely.
    REQUEST_TIMEOUT = 30

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for notion:// URIs and notion.so/notion.site URLs."""
        if uri.startswith("notion://"):
            return 1.0
        if "notion.so" in uri or "notion.site" in uri:
            return 1.0
        return 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Fetch the resource referenced by ``task.uri`` and render it to Markdown.

        The URI alone does not reveal whether the ID is a page or a database,
        so the page endpoint is tried first and the database endpoint is used
        as a fallback.

        Args:
            task: must carry ``params["notion_token"]`` (integration token).

        Returns:
            Dict with ``content`` (Markdown string) and ``meta``.

        Raises:
            ValueError: token missing, or no UUID found in the URI.
            RuntimeError: the ID is neither an accessible page nor database.
        """
        token = task.params.get("notion_token")
        if not token:
            raise ValueError("[NotionFetcher] 'notion_token' is required.")

        # Stored on the instance so the private helpers below can reuse it.
        self.headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        }

        resource_id = self._extract_id(task.uri)
        if not resource_id:
            raise ValueError(f"Invalid Notion ID in URI: {task.uri}")

        try:
            return self._fetch_as_page(resource_id)
        except RuntimeError:
            self._log(
                f"ID {resource_id} is not a Page. Trying Database...", level="debug"
            )
            return self._fetch_as_database_root(resource_id)

    # --- Mode A: Page (Recursive) ---
    def _fetch_as_page(self, page_id: str) -> Dict[str, Any]:
        """Render a page (and all nested blocks) to a Markdown document."""
        url = f"https://api.notion.com/v1/pages/{page_id}"
        resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

        if resp.status_code != 200:
            # Signals _do_fetch to retry the ID as a database.
            raise RuntimeError(f"Status {resp.status_code}")

        page_meta = resp.json()
        title = self._extract_title_prop(page_meta)

        md_content = f"# {title}\n\n"
        root_blocks = self._get_children_recursive(page_id)
        for block in root_blocks:
            md_content += self._block_to_markdown(block) + "\n"

        return {
            "content": md_content,
            "meta": {
                "source": "notion",
                "type": "page",
                "page_id": page_id,
                "title": title,
                "url": page_meta.get("url", ""),
                "filename": f"{self._sanitize_filename(title)}.md",
            },
        }

    # --- Mode B: Database (Root) ---
    def _fetch_as_database_root(self, db_id: str) -> Dict[str, Any]:
        """Render a standalone database to a Markdown table document."""
        url = f"https://api.notion.com/v1/databases/{db_id}"
        resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        if resp.status_code != 200:
            raise RuntimeError(f"Access Failed: {db_id}")

        db_meta = resp.json()
        title = (
            "".join([t.get("plain_text", "") for t in db_meta.get("title", [])])
            or "Untitled DB"
        )

        md_table = self._query_and_render_database(db_id)

        full_md = f"# [DB] {title}\n\n{md_table}"

        return {
            "content": full_md,
            "meta": {
                "source": "notion",
                "type": "database",
                "page_id": db_id,
                "title": title,
                "filename": f"DB_{self._sanitize_filename(title)}.md",
            },
        }

    # --- Core Logic: Block to Markdown ---
    def _block_to_markdown(self, block: Dict, indent: int = 0) -> str:
        """
        Convert a single Notion block (plus its pre-fetched ``children_data``)
        to a Markdown fragment, recursing into nested blocks.
        """
        b_type = block.get("type")
        prefix = "  " * indent

        # [Inline Database]
        if b_type == "child_database":
            db_id = block["id"]
            db_title = block.get("child_database", {}).get("title", "Inline Database")
            table_md = self._query_and_render_database(db_id)

            return f"\n{prefix}### 📂 {db_title}\n{table_md}\n"

        # [Simple Table]
        if b_type == "table":
            rows = block.get("children_data", [])
            if not rows:
                return ""
            md_lines = []
            has_header = block.get("table", {}).get("has_column_header", False)
            for idx, row in enumerate(rows):
                if row.get("type") == "table_row":
                    cells = row.get("table_row", {}).get("cells", [])
                    row_txt = "|" + "".join(
                        [
                            f" {''.join([t.get('plain_text', '') for t in c])} |"
                            for c in cells
                        ]
                    )
                    md_lines.append(f"{prefix}{row_txt}")
                    # Emit the Markdown header separator right after row 0.
                    if has_header and idx == 0:
                        md_lines.append(
                            f"{prefix}|" + "|".join([" --- " for _ in cells]) + "|"
                        )
            return "\n".join(md_lines) + "\n"

        # Standard Blocks
        content = block.get(b_type, {})
        text = ""
        if "rich_text" in content:
            text = "".join(
                [t.get("plain_text", "") for t in content.get("rich_text", [])]
            )

        md = ""
        if b_type == "paragraph":
            md = f"{prefix}{text}\n"
        elif b_type.startswith("heading_"):
            # heading_1..heading_3 map to '#'..'###'.
            md = f"\n{prefix}{'#' * int(b_type[-1])} {text}\n"
        elif b_type == "bulleted_list_item":
            md = f"{prefix}- {text}"
        elif b_type == "numbered_list_item":
            md = f"{prefix}1. {text}"
        elif b_type == "code":
            lang = content.get("language", "text")
            md = f"{prefix}```{lang}\n{prefix}{text}\n{prefix}```"
        elif b_type == "image":
            src = content.get("file", {}).get("url") or content.get("external", {}).get(
                "url"
            )
            # BUGFIX: `src` used to be computed and then discarded, so images
            # were silently dropped from the output. Render the reference.
            md = f"{prefix}![image]({src})\n" if src else f"{prefix}"

        # Recursive Children (except tables/dbs which handle their own data)
        if block.get("children_data") and b_type not in ["table", "child_database"]:
            next_indent = (
                indent + 1
                if b_type in ["toggle", "bulleted_list_item", "numbered_list_item"]
                else indent
            )
            for child in block["children_data"]:
                child_md = self._block_to_markdown(child, next_indent)
                if child_md:
                    md += "\n" + child_md

        return md

    # --- Helper: Universal Database Renderer ---
    def _query_and_render_database(self, db_id: str) -> str:
        """
        Query any database ID and return its contents rendered as a Markdown
        table string. Used by both the root-database fetch and the inline
        (embedded) database path.
        """
        items = []
        query_url = f"https://api.notion.com/v1/databases/{db_id}/query"
        has_more = True
        next_cursor = None

        # 1. Fetch All Items (the query endpoint pages at 100 results)
        while has_more:
            payload = {
                "page_size": 100,
                # "sorts": [{"timestamp": "created_time", "direction": "ascending"}],
            }
            if next_cursor:
                payload["start_cursor"] = next_cursor
            r = requests.post(
                query_url,
                headers=self.headers,
                json=payload,
                timeout=self.REQUEST_TIMEOUT,
            )
            if r.status_code != 200:
                # Best-effort: degrade to a placeholder instead of failing
                # the whole page render.
                self._log(f"DB Query Failed {db_id}: {r.text}", level="warning")
                return "> ⚠️ Failed to load database content."

            d = r.json()
            items.extend(d.get("results", []))
            has_more = d.get("has_more")
            next_cursor = d.get("next_cursor")

        if not items:
            return "> (Empty Database)"

        # 2. Extract Headers (from first item)
        first_props = items[0].get("properties", {})
        headers = list(first_props.keys())

        # 3. Build Markdown Table
        # Header
        md = "| " + " | ".join(headers) + " |\n"
        md += "| " + " | ".join(["---"] * len(headers)) + " |\n"

        # Rows
        for item in items:
            row_cells = []
            props = item.get("properties", {})
            for col in headers:
                val = props.get(col, {})
                txt = self._extract_prop_value(val)
                # Newlines and pipes would break the Markdown table layout.
                safe_txt = txt.replace("\n", " ").replace("|", "/")
                row_cells.append(safe_txt)
            md += "| " + " | ".join(row_cells) + " |\n"

        return md

    # --- Utils ---
    def _extract_id(self, source: str) -> "str | None":
        """Extract the first UUID-looking token from a URI, or None."""
        if "?" in source:
            source = source.split("?")[0]
        match = self.UUID_PATTERN.search(source)
        return match.group(0) if match else None

    def _get_children_recursive(self, block_id: str) -> List[Dict]:
        """
        Fetch all child blocks of ``block_id``, following pagination, and
        attach each block's own children under ``children_data``.
        """
        results = []
        url = f"https://api.notion.com/v1/blocks/{block_id}/children?page_size=100"
        while url:
            resp = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            if resp.status_code != 200:
                break
            data = resp.json()
            for block in data.get("results", []):
                if block.get("has_children"):
                    block["children_data"] = self._get_children_recursive(block["id"])
                results.append(block)
            url = (
                f"https://api.notion.com/v1/blocks/{block_id}/children?page_size=100&start_cursor={data['next_cursor']}"
                if data.get("has_more")
                else None
            )
        return results

    def _extract_title_prop(self, meta: Dict) -> str:
        """Return the page title from its properties, or 'Untitled'."""
        for v in meta.get("properties", {}).values():
            if v.get("type") == "title":
                # Default "" guards against a None plain_text breaking join().
                return "".join([t.get("plain_text", "") for t in v.get("title", [])])
        return "Untitled"

    def _extract_prop_value(self, val: Dict) -> str:
        """Flatten one database property value to a plain string ('' if unknown)."""
        t = val.get("type")
        if t == "title":
            return "".join([x.get("plain_text", "") for x in val.get("title", [])])
        if t == "rich_text":
            return "".join([x.get("plain_text", "") for x in val.get("rich_text", [])])
        if t == "select":
            return val.get("select", {}).get("name") or ""
        if t == "status":
            return val.get("status", {}).get("name") or ""
        if t == "url":
            return val.get("url") or ""
        if t == "date":
            return val.get("date", {}).get("start") or ""
        if t == "checkbox":
            return "Yes" if val.get("checkbox") else "No"
        if t == "email":
            return val.get("email") or ""
        if t == "number":
            return str(val.get("number") or "")
        return ""

    def _sanitize_filename(self, name: str) -> str:
        """Strip characters that are illegal in filenames; spaces -> underscores."""
        return re.sub(r'[\\/*?:"<>|]', "", name).replace(" ", "_")
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from typing import Iterator
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
from sayou.core.registry import register_component
|
|
5
|
+
from sayou.core.schemas import SayouTask
|
|
6
|
+
|
|
7
|
+
from ..interfaces.base_generator import BaseGenerator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_component("generator")
class NotionGenerator(BaseGenerator):
    """
    Discovers Notion pages accessible by the integration token.

    Supports:
    - notion://search : Find all pages
    - notion://page/{page_id} : Target specific page
    """

    component_name = "NotionGenerator"
    SUPPORTED_TYPES = ["notion"]

    # Seconds before a search request is aborted instead of hanging forever.
    REQUEST_TIMEOUT = 30

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Yield one SayouTask per discovered page.

        Args:
            source: "notion://search" or "notion://page/{page_id}".
            **kwargs: must contain 'notion_token'.

        Raises:
            ValueError: if 'notion_token' is missing.
            RuntimeError: if the Notion search endpoint returns an error.
        """
        token = kwargs.get("notion_token")
        if not token:
            raise ValueError("Config 'notion_token' is required.")

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        }

        if "notion://page/" in source:
            page_id = source.split("notion://page/")[-1]
            yield self._create_task(page_id, "Target Page", token)
            return

        if source == "notion://search":
            url = "https://api.notion.com/v1/search"
            # BUGFIX: the search endpoint is paginated (max 100 results per
            # call). Previously only the first response was consumed, so
            # workspaces with more pages were silently truncated. Follow
            # next_cursor until has_more is False.
            next_cursor = None
            while True:
                payload = {"filter": {"value": "page", "property": "object"}}
                if next_cursor:
                    payload["start_cursor"] = next_cursor

                response = requests.post(
                    url, headers=headers, json=payload, timeout=self.REQUEST_TIMEOUT
                )
                if response.status_code != 200:
                    raise RuntimeError(f"Notion Search Failed: {response.text}")

                data = response.json()
                for page in data.get("results", []):
                    page_id = page["id"]
                    title = "Untitled"
                    props = page.get("properties", {})
                    for val in props.values():
                        if val.get("type") == "title":
                            titles = val.get("title", [])
                            if titles:
                                title = titles[0].get("plain_text", "")
                            break

                    yield self._create_task(page_id, title, token)

                if not data.get("has_more"):
                    break
                next_cursor = data.get("next_cursor")

    def _create_task(self, page_id: str, title: str, token: str) -> SayouTask:
        """Build the fetch task for one page, carrying the token in params."""
        return SayouTask(
            uri=f"notion://page/{page_id}",
            source_type="notion",
            params={
                "notion_token": token,
            },
            meta={
                "source": "notion",
                "page_id": page_id,
                "title": title,
                "filename": f"notion_{title}_{page_id[:8]}",
            },
        )
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
from typing import Any, Dict
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
from sayou.core.registry import register_component
|
|
5
|
+
from sayou.core.schemas import SayouTask
|
|
6
|
+
|
|
7
|
+
from ..interfaces.base_fetcher import BaseFetcher
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from youtube_transcript_api import YouTubeTranscriptApi
|
|
11
|
+
except ImportError:
|
|
12
|
+
YouTubeTranscriptApi = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@register_component("fetcher")
class YouTubeFetcher(BaseFetcher):
    """
    Fetches raw YouTube transcript and metadata.
    Returns a Dict containing 'transcript' (List) and 'video_meta' (Dict).
    """

    component_name = "YouTubeFetcher"
    SUPPORTED_TYPES = ["youtube"]

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 only for youtube:// URIs."""
        return 1.0 if uri.startswith("youtube://") else 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Fetch the transcript and scraped metadata for one video.

        The video ID is taken from ``task.meta["video_id"]`` if present,
        otherwise from the URI itself. A missing transcript degrades to an
        empty list rather than failing the whole fetch.

        Raises:
            ImportError: 'youtube-transcript-api' is not installed.
        """
        if not YouTubeTranscriptApi:
            raise ImportError("Package 'youtube-transcript-api' is required.")

        video_id = task.meta.get("video_id")
        if not video_id:
            video_id = task.uri.replace("youtube://", "")

        video_meta = self._fetch_metadata(video_id)

        transcript_data = []
        try:
            yt_api = YouTubeTranscriptApi()
            transcript_list = yt_api.list(video_id)
            # Preferred caption languages, in priority order.
            transcript = transcript_list.find_transcript(["ko", "en", "en-US", "ja"])
            transcript_data = transcript.fetch()

        except Exception as e:
            # Best-effort: many videos have no captions at all.
            self._log(f"Transcript fetch failed for {video_id}: {e}", level="warning")
            transcript_data = []

        return {
            "content": transcript_data,
            "meta": {
                "source": "youtube",
                "video_id": video_id,
                "url": f"https://www.youtube.com/watch?v={video_id}",
                **video_meta,
            },
        }

    def _fetch_metadata(self, video_id: str) -> Dict[str, Any]:
        """
        Scrapes detailed metadata (Date, Views, Tags, Thumbnail) via requests.

        Scraping is best-effort: any failure leaves the defaults in place.
        NOTE(review): this parses YouTube's HTML meta tags, which can change
        without notice.
        """
        import re

        url = f"https://www.youtube.com/watch?v={video_id}"
        info = {
            "title": f"YouTube_{video_id}",
            "author": "Unknown",
            "description": "",
            "publish_date": "",
            "view_count": 0,
            "keywords": [],
            "thumbnail_url": f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
            "duration_seconds": 0,
        }

        try:
            res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5)
            if res.status_code == 200:
                html = res.text

                # 1. Title
                title_match = re.search(
                    r'<meta property="og:title" content="(.*?)">', html
                )
                if title_match:
                    info["title"] = title_match.group(1).replace(" - YouTube", "")

                # 2. Description
                desc_match = re.search(
                    r'<meta property="og:description" content="(.*?)">', html
                )
                if desc_match:
                    info["description"] = desc_match.group(1)

                # 3. Author (Channel Name)
                author_match = re.search(
                    r'<link itemprop="name" content="(.*?)">', html
                )
                if author_match:
                    info["author"] = author_match.group(1)

                # 4. Publish Date (ISO 8601 Format: YYYY-MM-DD)
                date_match = re.search(
                    r'<meta itemprop="datePublished" content="(.*?)">', html
                )
                if date_match:
                    info["publish_date"] = date_match.group(1)

                # 5. View Count (InteractionCount)
                views_match = re.search(
                    r'<meta itemprop="interactionCount" content="(\d+)">', html
                )
                if views_match:
                    info["view_count"] = int(views_match.group(1))

                # 6. Keywords (Tags)
                tags_match = re.search(r'<meta name="keywords" content="(.*?)">', html)
                if tags_match:
                    info["keywords"] = [
                        tag.strip() for tag in tags_match.group(1).split(",")
                    ]

                # 7. Duration (ISO 8601 Duration: PT1H30M...)
                dur_match = re.search(
                    r'<meta itemprop="duration" content="(.*?)">', html
                )
                if dur_match:
                    info["duration_iso"] = dur_match.group(1)
                    # BUGFIX: 'duration_seconds' was initialized but never
                    # populated; derive it from the ISO 8601 duration.
                    info["duration_seconds"] = self._parse_iso_duration(
                        dur_match.group(1)
                    )

        except Exception as e:
            self._log(f"Meta scraping warning: {e}", level="debug")

        return info

    @staticmethod
    def _parse_iso_duration(iso: str) -> int:
        """Convert an ISO 8601 duration (e.g. 'PT1H30M5S') to whole seconds."""
        import re

        m = re.fullmatch(r"P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", iso)
        if not m:
            return 0
        days, hours, minutes, seconds = (int(g) if g else 0 for g in m.groups())
        return days * 86400 + hours * 3600 + minutes * 60 + seconds
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Iterator
|
|
3
|
+
|
|
4
|
+
from sayou.core.registry import register_component
|
|
5
|
+
from sayou.core.schemas import SayouTask
|
|
6
|
+
|
|
7
|
+
from ..interfaces.base_generator import BaseGenerator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_component("generator")
class YouTubeGenerator(BaseGenerator):
    """
    Parses YouTube URLs/IDs and generates tasks.
    Supports comma-separated inputs.
    """

    component_name = "YouTubeGenerator"
    SUPPORTED_TYPES = ["youtube"]

    @classmethod
    def can_handle(cls, source: str) -> float:
        """Return 1.0 only for youtube:// sources."""
        return 1.0 if source.startswith("youtube://") else 0.0

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Input: "https://youtu.be/xyz123, youtube://abc456"
        Output: SayouTask(uri="youtube://xyz123")

        Entries that yield no recognizable video ID are skipped silently.
        """
        cleaned = source.replace("youtube://", "")
        for candidate in (part.strip() for part in cleaned.split(",")):
            vid = self._extract_video_id(candidate)
            if not vid:
                continue
            yield SayouTask(
                uri=f"youtube://{vid}",
                source_type="youtube",
                meta={
                    "source": "youtube",
                    "video_id": vid,
                    "filename": f"youtube_{vid}",
                },
            )

    def _extract_video_id(self, url_or_id: str) -> str:
        """
        Return the 11-character video ID from a bare ID or any common
        YouTube URL shape; None when nothing matches.
        """
        # Fast path: the input already is a bare 11-char video ID.
        if len(url_or_id) == 11 and re.match(r"^[a-zA-Z0-9_-]{11}$", url_or_id):
            return url_or_id

        # URL shapes: watch?v=, path segment, youtu.be short link, embed.
        url_patterns = (
            r"(?:v=|\/)([0-9A-Za-z_-]{11}).*",
            r"(?:youtu\.be\/)([0-9A-Za-z_-]{11})",
            r"(?:embed\/)([0-9A-Za-z_-]{11})",
        )
        for candidate_pattern in url_patterns:
            hit = re.search(candidate_pattern, url_or_id)
            if hit:
                return hit.group(1)

        return None
|