sayou-connector 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,301 @@
1
+ import re
2
+ from typing import Any, Dict, List
3
+
4
+ import requests
5
+ from sayou.core.registry import register_component
6
+ from sayou.core.schemas import SayouTask
7
+
8
+ from ..interfaces.base_fetcher import BaseFetcher
9
+
10
+
11
@register_component("fetcher")
class NotionFetcher(BaseFetcher):
    """
    Fetches Notion content and renders it as Markdown.

    Supports:
        1. Pages (text, media, simple tables)
        2. Inline databases (databases embedded inside pages)
        3. Full databases (URL with ?v=...)
    """

    component_name = "NotionFetcher"
    SUPPORTED_TYPES = ["notion"]

    # Notion resource IDs appear both dashed and undashed in URLs.
    UUID_PATTERN = re.compile(
        r"[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}",
        re.IGNORECASE,
    )

    # Abort any single Notion API call after this many seconds so a hung
    # connection cannot stall the whole fetch.
    REQUEST_TIMEOUT = 30

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 when *uri* looks like a Notion resource, else 0.0."""
        if uri.startswith("notion://"):
            return 1.0
        if "notion.so" in uri or "notion.site" in uri:
            return 1.0
        return 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Fetch the resource addressed by ``task.uri`` as a Markdown document.

        Tries page mode first; when the ID is not a page, falls back to
        database mode.

        Raises:
            ValueError: when 'notion_token' is missing or the URI contains
                no recognizable Notion ID.
            RuntimeError: when the ID is neither an accessible page nor an
                accessible database.
        """
        token = task.params.get("notion_token")
        if not token:
            raise ValueError("[NotionFetcher] 'notion_token' is required.")

        self.headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        }

        resource_id = self._extract_id(task.uri)
        if not resource_id:
            raise ValueError(f"Invalid Notion ID in URI: {task.uri}")

        try:
            return self._fetch_as_page(resource_id)
        except RuntimeError:
            self._log(
                f"ID {resource_id} is not a Page. Trying Database...", level="debug"
            )
            return self._fetch_as_database_root(resource_id)

    # --- Mode A: Page (Recursive) ---
    def _fetch_as_page(self, page_id: str) -> Dict[str, Any]:
        """Render a page and all of its nested blocks to Markdown."""
        url = f"https://api.notion.com/v1/pages/{page_id}"
        resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

        if resp.status_code != 200:
            raise RuntimeError(f"Status {resp.status_code}")

        page_meta = resp.json()
        title = self._extract_title_prop(page_meta)

        # Collect parts and join once instead of repeated string concatenation.
        parts = [f"# {title}\n\n"]
        for block in self._get_children_recursive(page_id):
            parts.append(self._block_to_markdown(block) + "\n")

        return {
            "content": "".join(parts),
            "meta": {
                "source": "notion",
                "type": "page",
                "page_id": page_id,
                "title": title,
                "url": page_meta.get("url", ""),
                "filename": f"{self._sanitize_filename(title)}.md",
            },
        }

    # --- Mode B: Database (Root) ---
    def _fetch_as_database_root(self, db_id: str) -> Dict[str, Any]:
        """Render a full database as a single Markdown-table document."""
        url = f"https://api.notion.com/v1/databases/{db_id}"
        resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        if resp.status_code != 200:
            raise RuntimeError(f"Access Failed: {db_id}")

        db_meta = resp.json()
        title = (
            "".join(t.get("plain_text", "") for t in db_meta.get("title", []))
            or "Untitled DB"
        )

        md_table = self._query_and_render_database(db_id)

        full_md = f"# [DB] {title}\n\n{md_table}"

        return {
            "content": full_md,
            "meta": {
                "source": "notion",
                "type": "database",
                "page_id": db_id,
                "title": title,
                "filename": f"DB_{self._sanitize_filename(title)}.md",
            },
        }

    # --- Core Logic: Block to Markdown ---
    def _block_to_markdown(self, block: Dict, indent: int = 0) -> str:
        """
        Convert one Notion block (plus its pre-fetched children) to Markdown.

        ``indent`` controls nesting depth for list-like blocks.
        """
        b_type = block.get("type")
        if not b_type:
            # Defensive: a block without a type cannot be rendered (and would
            # otherwise crash the startswith() check below).
            return ""
        prefix = " " * indent

        # [Inline Database]
        if b_type == "child_database":
            db_id = block["id"]
            db_title = block.get("child_database", {}).get("title", "Inline Database")
            table_md = self._query_and_render_database(db_id)

            return f"\n{prefix}### 📂 {db_title}\n{table_md}\n"

        # [Simple Table]
        if b_type == "table":
            rows = block.get("children_data", [])
            if not rows:
                return ""
            md_lines = []
            has_header = block.get("table", {}).get("has_column_header", False)
            for idx, row in enumerate(rows):
                if row.get("type") != "table_row":
                    # Skip non-row children; this also prevents the header
                    # branch below from referencing an undefined cell list.
                    continue
                cells = row.get("table_row", {}).get("cells", [])
                row_txt = "|" + "".join(
                    f" {''.join(t.get('plain_text', '') for t in c)} |" for c in cells
                )
                md_lines.append(f"{prefix}{row_txt}")
                if has_header and idx == 0:
                    md_lines.append(
                        f"{prefix}|" + "|".join(" --- " for _ in cells) + "|"
                    )
            return "\n".join(md_lines) + "\n"

        # Standard Blocks
        content = block.get(b_type, {})
        text = ""
        if "rich_text" in content:
            text = "".join(
                t.get("plain_text", "") for t in content.get("rich_text", [])
            )

        md = ""
        if b_type == "paragraph":
            md = f"{prefix}{text}\n"
        elif b_type.startswith("heading_"):
            # heading_1..heading_3 map to '#'..'###'.
            md = f"\n{prefix}{'#' * int(b_type[-1])} {text}\n"
        elif b_type == "bulleted_list_item":
            md = f"{prefix}- {text}"
        elif b_type == "numbered_list_item":
            md = f"{prefix}1. {text}"
        elif b_type == "code":
            lang = content.get("language", "text")
            md = f"{prefix}```{lang}\n{prefix}{text}\n{prefix}```"
        elif b_type == "image":
            # Uploaded files live under "file", linked ones under "external".
            src = content.get("file", {}).get("url") or content.get("external", {}).get(
                "url"
            )
            md = f"{prefix}![img]({src})"

        # Recursive children (tables/inline DBs already consumed their own data).
        if block.get("children_data") and b_type not in ["table", "child_database"]:
            next_indent = (
                indent + 1
                if b_type in ["toggle", "bulleted_list_item", "numbered_list_item"]
                else indent
            )
            for child in block["children_data"]:
                child_md = self._block_to_markdown(child, next_indent)
                if child_md:
                    md += "\n" + child_md

        return md

    # --- Helper: Universal Database Renderer ---
    def _query_and_render_database(self, db_id: str) -> str:
        """
        Query any database by ID and return its rows as a Markdown table.

        Shared by the root-database fetch and the inline-database fetch.
        Returns a placeholder string instead of raising so that a broken
        inline database does not abort the whole page render.
        """
        items = []
        query_url = f"https://api.notion.com/v1/databases/{db_id}/query"
        has_more = True
        next_cursor = None

        # 1. Fetch all items (the query endpoint pages at 100 results).
        while has_more:
            payload = {"page_size": 100}
            if next_cursor:
                payload["start_cursor"] = next_cursor
            r = requests.post(
                query_url,
                headers=self.headers,
                json=payload,
                timeout=self.REQUEST_TIMEOUT,
            )
            if r.status_code != 200:
                self._log(f"DB Query Failed {db_id}: {r.text}", level="warning")
                return "> ⚠️ Failed to load database content."

            d = r.json()
            items.extend(d.get("results", []))
            has_more = d.get("has_more")
            next_cursor = d.get("next_cursor")

        if not items:
            return "> (Empty Database)"

        # 2. Column headers come from the first item's properties.
        headers = list(items[0].get("properties", {}).keys())

        # 3. Build the Markdown table.
        md = "| " + " | ".join(headers) + " |\n"
        md += "| " + " | ".join(["---"] * len(headers)) + " |\n"

        for item in items:
            props = item.get("properties", {})
            row_cells = []
            for col in headers:
                txt = self._extract_prop_value(props.get(col, {}))
                # Escape characters that would break the table layout.
                row_cells.append(txt.replace("\n", " ").replace("|", "/"))
            md += "| " + " | ".join(row_cells) + " |\n"

        return md

    # --- Utils ---
    def _extract_id(self, source: str) -> str:
        """Return the first Notion UUID found in *source*, or '' if none."""
        if "?" in source:
            source = source.split("?")[0]
        match = self.UUID_PATTERN.search(source)
        # Return '' (not None) so the annotated return type holds; both are
        # falsy for the caller's `if not resource_id` check.
        return match.group(0) if match else ""

    def _get_children_recursive(self, block_id: str) -> List[Dict]:
        """Fetch all child blocks of *block_id*, recursing into nested blocks."""
        results = []
        url = f"https://api.notion.com/v1/blocks/{block_id}/children?page_size=100"
        while url:
            resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            if resp.status_code != 200:
                break
            data = resp.json()
            for block in data.get("results", []):
                if block.get("has_children"):
                    block["children_data"] = self._get_children_recursive(block["id"])
                results.append(block)
            url = (
                f"https://api.notion.com/v1/blocks/{block_id}/children?page_size=100&start_cursor={data['next_cursor']}"
                if data.get("has_more")
                else None
            )
        return results

    def _extract_title_prop(self, meta: Dict) -> str:
        """Return the text of the page's title property, or 'Untitled'."""
        for v in meta.get("properties", {}).values():
            if v.get("type") == "title":
                # Default '' guards against fragments missing 'plain_text',
                # which would otherwise make join() raise TypeError.
                return "".join(t.get("plain_text", "") for t in v.get("title", []))
        return "Untitled"

    def _extract_prop_value(self, val: Dict) -> str:
        """Render a single database property value as plain text."""
        t = val.get("type")
        if t == "title":
            return "".join(x.get("plain_text", "") for x in val.get("title", []))
        if t == "rich_text":
            return "".join(x.get("plain_text", "") for x in val.get("rich_text", []))
        # Empty select/status/date values arrive as None (not a missing key),
        # so guard with `or {}` before the nested .get() to avoid
        # AttributeError on NoneType.
        if t == "select":
            return (val.get("select") or {}).get("name") or ""
        if t == "status":
            return (val.get("status") or {}).get("name") or ""
        if t == "url":
            return val.get("url") or ""
        if t == "date":
            return (val.get("date") or {}).get("start") or ""
        if t == "checkbox":
            return "Yes" if val.get("checkbox") else "No"
        if t == "email":
            return val.get("email") or ""
        if t == "number":
            return str(val.get("number") or "")
        return ""

    def _sanitize_filename(self, name: str) -> str:
        """Strip characters illegal in filenames; spaces become underscores."""
        return re.sub(r'[\\/*?:"<>|]', "", name).replace(" ", "_")
@@ -0,0 +1,73 @@
1
+ from typing import Iterator
2
+
3
+ import requests
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouTask
6
+
7
+ from ..interfaces.base_generator import BaseGenerator
8
+
9
+
10
+ @register_component("generator")
11
+ class NotionGenerator(BaseGenerator):
12
+ """
13
+ Discovers Notion pages accessible by the integration token.
14
+ Supports:
15
+ - notion://search : Find all pages
16
+ - notion://page/{page_id} : Target specific page
17
+ """
18
+
19
+ component_name = "NotionGenerator"
20
+ SUPPORTED_TYPES = ["notion"]
21
+
22
+ def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
23
+ token = kwargs.get("notion_token")
24
+ if not token:
25
+ raise ValueError("Config 'notion_token' is required.")
26
+
27
+ headers = {
28
+ "Authorization": f"Bearer {token}",
29
+ "Content-Type": "application/json",
30
+ "Notion-Version": "2022-06-28",
31
+ }
32
+
33
+ if "notion://page/" in source:
34
+ page_id = source.split("notion://page/")[-1]
35
+ yield self._create_task(page_id, "Target Page", token)
36
+ return
37
+
38
+ if source == "notion://search":
39
+ url = "https://api.notion.com/v1/search"
40
+ payload = {"filter": {"value": "page", "property": "object"}}
41
+
42
+ response = requests.post(url, headers=headers, json=payload)
43
+ if response.status_code == 200:
44
+ results = response.json().get("results", [])
45
+ for page in results:
46
+ page_id = page["id"]
47
+ title = "Untitled"
48
+ props = page.get("properties", {})
49
+ for key, val in props.items():
50
+ if val.get("type") == "title":
51
+ titles = val.get("title", [])
52
+ if titles:
53
+ title = titles[0].get("plain_text", "")
54
+ break
55
+
56
+ yield self._create_task(page_id, title, token)
57
+ else:
58
+ raise RuntimeError(f"Notion Search Failed: {response.text}")
59
+
60
+ def _create_task(self, page_id: str, title: str, token: str) -> SayouTask:
61
+ return SayouTask(
62
+ uri=f"notion://page/{page_id}",
63
+ source_type="notion",
64
+ params={
65
+ "notion_token": token,
66
+ },
67
+ meta={
68
+ "source": "notion",
69
+ "page_id": page_id,
70
+ "title": title,
71
+ "filename": f"notion_{title}_{page_id[:8]}",
72
+ },
73
+ )
@@ -0,0 +1,134 @@
1
+ from typing import Any, Dict
2
+
3
+ import requests
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouTask
6
+
7
+ from ..interfaces.base_fetcher import BaseFetcher
8
+
9
+ try:
10
+ from youtube_transcript_api import YouTubeTranscriptApi
11
+ except ImportError:
12
+ YouTubeTranscriptApi = None
13
+
14
+
15
@register_component("fetcher")
class YouTubeFetcher(BaseFetcher):
    """
    Fetches raw YouTube transcript and metadata.
    Returns a Dict containing 'transcript' (List) and 'video_meta' (Dict).
    """

    component_name = "YouTubeFetcher"
    SUPPORTED_TYPES = ["youtube"]

    @classmethod
    def can_handle(cls, uri: str) -> float:
        """Return 1.0 for youtube:// URIs, else 0.0."""
        return 1.0 if uri.startswith("youtube://") else 0.0

    def _do_fetch(self, task: SayouTask) -> Dict[str, Any]:
        """
        Fetch the transcript and scraped metadata for one video.

        Transcript retrieval is best effort: when captions are missing or
        disabled, 'content' is an empty list and a warning is logged.

        Raises:
            ImportError: when youtube-transcript-api is not installed.
        """
        if not YouTubeTranscriptApi:
            raise ImportError("Package 'youtube-transcript-api' is required.")

        # Prefer the ID from task meta; fall back to the URI itself.
        video_id = task.meta.get("video_id")
        if not video_id:
            video_id = task.uri.replace("youtube://", "")

        video_meta = self._fetch_metadata(video_id)

        transcript_data = []
        try:
            yt_api = YouTubeTranscriptApi()
            transcript_list = yt_api.list(video_id)
            # Language preference order: Korean, English variants, Japanese.
            transcript = transcript_list.find_transcript(["ko", "en", "en-US", "ja"])
            transcript_data = transcript.fetch()

        except Exception as e:
            # Missing/disabled captions must not abort the whole fetch.
            self._log(f"Transcript fetch failed for {video_id}: {e}", level="warning")
            transcript_data = []

        return {
            "content": transcript_data,
            "meta": {
                "source": "youtube",
                "video_id": video_id,
                "url": f"https://www.youtube.com/watch?v={video_id}",
                **video_meta,
            },
        }

    def _fetch_metadata(self, video_id: str) -> Dict[str, Any]:
        """
        Scrapes detailed metadata (Date, Views, Tags, Thumbnail) via requests.

        Best effort: any scraping failure leaves the defaults in place.
        """
        import re

        url = f"https://www.youtube.com/watch?v={video_id}"
        info = {
            "title": f"YouTube_{video_id}",
            "author": "Unknown",
            "description": "",
            "publish_date": "",
            "view_count": 0,
            "keywords": [],
            "thumbnail_url": f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
            "duration_seconds": 0,
        }

        try:
            res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5)
            if res.status_code == 200:
                html = res.text

                def first(pattern: str) -> str:
                    """First capture group of *pattern* in the page, or ''."""
                    m = re.search(pattern, html)
                    return m.group(1) if m else ""

                # 1. Title (strip YouTube's site suffix)
                title = first(r'<meta property="og:title" content="(.*?)">')
                if title:
                    info["title"] = title.replace(" - YouTube", "")

                # 2. Description
                desc = first(r'<meta property="og:description" content="(.*?)">')
                if desc:
                    info["description"] = desc

                # 3. Author (Channel Name)
                author = first(r'<link itemprop="name" content="(.*?)">')
                if author:
                    info["author"] = author

                # 4. Publish Date (ISO 8601 Format: YYYY-MM-DD)
                date = first(r'<meta itemprop="datePublished" content="(.*?)">')
                if date:
                    info["publish_date"] = date

                # 5. View Count (InteractionCount)
                views = first(r'<meta itemprop="interactionCount" content="(\d+)">')
                if views:
                    info["view_count"] = int(views)

                # 6. Keywords (Tags)
                tags = first(r'<meta name="keywords" content="(.*?)">')
                if tags:
                    info["keywords"] = [tag.strip() for tag in tags.split(",")]

                # 7. Duration (ISO 8601 Duration: PT1H30M...). Keep the raw
                # ISO value for backward compatibility and also fill the
                # previously-always-zero 'duration_seconds'.
                dur = first(r'<meta itemprop="duration" content="(.*?)">')
                if dur:
                    info["duration_iso"] = dur
                    info["duration_seconds"] = self._iso_duration_to_seconds(dur)

        except Exception as e:
            self._log(f"Meta scraping warning: {e}", level="debug")

        return info

    @staticmethod
    def _iso_duration_to_seconds(iso: str) -> int:
        """Convert an ISO 8601 duration like 'PT1H30M15S' to whole seconds.

        Returns 0 for values that do not match the simple D/H/M/S shape.
        """
        import re

        m = re.match(r"^P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", iso)
        if not m:
            return 0
        days, hours, minutes, seconds = (int(g) if g else 0 for g in m.groups())
        return ((days * 24 + hours) * 60 + minutes) * 60 + seconds
@@ -0,0 +1,60 @@
1
+ import re
2
+ from typing import Iterator
3
+
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouTask
6
+
7
+ from ..interfaces.base_generator import BaseGenerator
8
+
9
+
10
@register_component("generator")
class YouTubeGenerator(BaseGenerator):
    """
    Parses YouTube URLs/IDs and generates tasks.
    Supports comma-separated inputs.
    """

    component_name = "YouTubeGenerator"
    SUPPORTED_TYPES = ["youtube"]

    # A bare video ID: exactly 11 URL-safe base64 characters.
    # fullmatch subsumes the previous explicit length check.
    _ID_PATTERN = re.compile(r"[a-zA-Z0-9_-]{11}")

    # URL shapes embedding a video ID, tried in order. Compiled once at
    # class creation instead of on every call.
    _URL_PATTERNS = (
        re.compile(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"),
        re.compile(r"(?:youtu\.be\/)([0-9A-Za-z_-]{11})"),
        re.compile(r"(?:embed\/)([0-9A-Za-z_-]{11})"),
    )

    @classmethod
    def can_handle(cls, source: str) -> float:
        """Return 1.0 for youtube:// sources, else 0.0."""
        return 1.0 if source.startswith("youtube://") else 0.0

    def _do_generate(self, source: str, **kwargs) -> Iterator[SayouTask]:
        """
        Input: "https://youtu.be/xyz123, youtube://abc456"
        Output: SayouTask(uri="youtube://xyz123")

        Items without a recognizable video ID are skipped silently.
        """
        raw_source = source.replace("youtube://", "")

        for item in (part.strip() for part in raw_source.split(",")):
            video_id = self._extract_video_id(item)
            if not video_id:
                continue
            yield SayouTask(
                uri=f"youtube://{video_id}",
                source_type="youtube",
                meta={
                    "source": "youtube",
                    "video_id": video_id,
                    "filename": f"youtube_{video_id}",
                },
            )

    def _extract_video_id(self, url_or_id: str) -> str:
        """
        Return the 11-character video ID contained in *url_or_id*, or ''
        (falsy, matching the declared str return type) when none is found.
        """
        # Already a bare ID?
        if self._ID_PATTERN.fullmatch(url_or_id):
            return url_or_id

        for pattern in self._URL_PATTERNS:
            match = pattern.search(url_or_id)
            if match:
                return match.group(1)

        return ""