aichat2md 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aichat2md-1.0.0 → aichat2md-1.1.0}/PKG-INFO +29 -5
- {aichat2md-1.0.0 → aichat2md-1.1.0}/README.md +25 -1
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md/__init__.py +1 -1
- aichat2md-1.1.0/aichat2md/extractors/playwright_extractor.py +108 -0
- aichat2md-1.1.0/aichat2md/prompts/system_prompt_en.txt +50 -0
- aichat2md-1.1.0/aichat2md/prompts/system_prompt_zh.txt +50 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md.egg-info/PKG-INFO +29 -5
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md.egg-info/SOURCES.txt +2 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/pyproject.toml +7 -4
- aichat2md-1.0.0/aichat2md/extractors/playwright_extractor.py +0 -58
- {aichat2md-1.0.0 → aichat2md-1.1.0}/LICENSE +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md/cli.py +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md/config.py +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md/extractors/__init__.py +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md/extractors/webarchive_extractor.py +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md/prompts/__init__.py +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md/structurizer.py +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md.egg-info/dependency_links.txt +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md.egg-info/entry_points.txt +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md.egg-info/requires.txt +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/aichat2md.egg-info/top_level.txt +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/setup.cfg +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/tests/test_cli.py +0 -0
- {aichat2md-1.0.0 → aichat2md-1.1.0}/tests/test_config.py +0 -0
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aichat2md
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Convert AI chat conversations to structured Markdown
|
|
5
5
|
Author: PlaceNameDay
|
|
6
6
|
License: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/
|
|
8
|
-
Project-URL: Repository, https://github.com/
|
|
9
|
-
Project-URL: Issues, https://github.com/
|
|
7
|
+
Project-URL: Homepage, https://github.com/placenameday/aichat2md
|
|
8
|
+
Project-URL: Repository, https://github.com/placenameday/aichat2md
|
|
9
|
+
Project-URL: Issues, https://github.com/placenameday/aichat2md/issues
|
|
10
10
|
Keywords: chatgpt,claude,markdown,ai,converter,deepseek
|
|
11
11
|
Classifier: Development Status :: 4 - Beta
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
@@ -32,7 +32,7 @@ Convert AI chat conversations to structured Markdown documents.
|
|
|
32
32
|
|
|
33
33
|
## Features
|
|
34
34
|
|
|
35
|
-
- 🌐 **Extract from URLs** - ChatGPT share links (with JS rendering via Playwright)
|
|
35
|
+
- 🌐 **Extract from URLs** - ChatGPT, Gemini, Doubao share links (with JS rendering via Playwright)
|
|
36
36
|
- 📄 **Extract from webarchive** - Safari .webarchive files (offline mode)
|
|
37
37
|
- 🤖 **Multiple AI backends** - DeepSeek, OpenAI, Groq, or any OpenAI-compatible API
|
|
38
38
|
- 🌍 **Bilingual support** - English/Chinese prompts
|
|
@@ -55,6 +55,30 @@ aichat2md https://chatgpt.com/share/xxx
|
|
|
55
55
|
aichat2md ~/Downloads/chat.webarchive
|
|
56
56
|
```
|
|
57
57
|
|
|
58
|
+
## Supported Platforms
|
|
59
|
+
|
|
60
|
+
- **ChatGPT** - chatgpt.com share links
|
|
61
|
+
- **Gemini** - gemini.google.com or g.co share links
|
|
62
|
+
- **Doubao (豆包)** - doubao.com share links
|
|
63
|
+
- **Webarchive** - Safari exported .webarchive files (any platform)
|
|
64
|
+
|
|
65
|
+
### Usage Examples
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# ChatGPT
|
|
69
|
+
aichat2md https://chatgpt.com/share/xxx
|
|
70
|
+
|
|
71
|
+
# Gemini (supports both long and short URLs)
|
|
72
|
+
aichat2md https://gemini.google.com/share/xxx
|
|
73
|
+
aichat2md https://g.co/gemini/share/xxx
|
|
74
|
+
|
|
75
|
+
# Doubao
|
|
76
|
+
aichat2md https://www.doubao.com/thread/xxx
|
|
77
|
+
|
|
78
|
+
# Webarchive file
|
|
79
|
+
aichat2md ~/Downloads/conversation.webarchive
|
|
80
|
+
```
|
|
81
|
+
|
|
58
82
|
## Supported AI Backends
|
|
59
83
|
|
|
60
84
|
- **DeepSeek** (default) - Cost-effective, Chinese service
|
|
@@ -4,7 +4,7 @@ Convert AI chat conversations to structured Markdown documents.
|
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
|
-
- 🌐 **Extract from URLs** - ChatGPT share links (with JS rendering via Playwright)
|
|
7
|
+
- 🌐 **Extract from URLs** - ChatGPT, Gemini, Doubao share links (with JS rendering via Playwright)
|
|
8
8
|
- 📄 **Extract from webarchive** - Safari .webarchive files (offline mode)
|
|
9
9
|
- 🤖 **Multiple AI backends** - DeepSeek, OpenAI, Groq, or any OpenAI-compatible API
|
|
10
10
|
- 🌍 **Bilingual support** - English/Chinese prompts
|
|
@@ -27,6 +27,30 @@ aichat2md https://chatgpt.com/share/xxx
|
|
|
27
27
|
aichat2md ~/Downloads/chat.webarchive
|
|
28
28
|
```
|
|
29
29
|
|
|
30
|
+
## Supported Platforms
|
|
31
|
+
|
|
32
|
+
- **ChatGPT** - chatgpt.com share links
|
|
33
|
+
- **Gemini** - gemini.google.com or g.co share links
|
|
34
|
+
- **Doubao (豆包)** - doubao.com share links
|
|
35
|
+
- **Webarchive** - Safari exported .webarchive files (any platform)
|
|
36
|
+
|
|
37
|
+
### Usage Examples
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# ChatGPT
|
|
41
|
+
aichat2md https://chatgpt.com/share/xxx
|
|
42
|
+
|
|
43
|
+
# Gemini (supports both long and short URLs)
|
|
44
|
+
aichat2md https://gemini.google.com/share/xxx
|
|
45
|
+
aichat2md https://g.co/gemini/share/xxx
|
|
46
|
+
|
|
47
|
+
# Doubao
|
|
48
|
+
aichat2md https://www.doubao.com/thread/xxx
|
|
49
|
+
|
|
50
|
+
# Webarchive file
|
|
51
|
+
aichat2md ~/Downloads/conversation.webarchive
|
|
52
|
+
```
|
|
53
|
+
|
|
30
54
|
## Supported AI Backends
|
|
31
55
|
|
|
32
56
|
- **DeepSeek** (default) - Cost-effective, Chinese service
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Extract content from AI chat share URLs using Playwright."""
|
|
2
|
+
|
|
3
|
+
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _detect_platform(url: str) -> str:
|
|
7
|
+
"""
|
|
8
|
+
Detect platform from URL.
|
|
9
|
+
|
|
10
|
+
Args:
|
|
11
|
+
url: Share URL
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
Platform name: 'doubao', 'gemini', or 'default'
|
|
15
|
+
"""
|
|
16
|
+
url_lower = url.lower()
|
|
17
|
+
if 'doubao.com' in url_lower:
|
|
18
|
+
return 'doubao'
|
|
19
|
+
elif 'gemini.google.com' in url_lower or 'g.co' in url_lower:
|
|
20
|
+
return 'gemini'
|
|
21
|
+
else:
|
|
22
|
+
return 'default'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_wait_time(platform: str) -> int:
|
|
26
|
+
"""
|
|
27
|
+
Get wait time in milliseconds for platform.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
platform: Platform name from _detect_platform
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
Wait time in milliseconds
|
|
34
|
+
"""
|
|
35
|
+
wait_times = {
|
|
36
|
+
'doubao': 3000,
|
|
37
|
+
'gemini': 5000,
|
|
38
|
+
'default': 2000
|
|
39
|
+
}
|
|
40
|
+
return wait_times.get(platform, 2000)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def extract_from_url(url: str, timeout: int = 30000) -> str:
    """
    Extract text content from AI chat share URL.

    Launches headless Chromium through Playwright, navigates to the URL,
    waits for the page to settle and returns the visible text of ``<body>``.

    Args:
        url: Share URL (ChatGPT, Gemini, Doubao, etc.); must start with
            ``http://`` or ``https://``
        timeout: Page load timeout in milliseconds

    Returns:
        Extracted plain text content, stripped of surrounding whitespace

    Raises:
        PlaywrightTimeoutError: If page fails to load
        ValueError: If URL is invalid
    """
    # A bare 'http' prefix check would also accept e.g. 'httpfoo://...'.
    if not url.startswith(('http://', 'https://')):
        raise ValueError(f"Invalid URL: {url}")

    # Detect platform and get corresponding wait time
    platform = _detect_platform(url)
    wait_time = _get_wait_time(platform)

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Use 'load' for Gemini/Doubao (networkidle may timeout due
                # to ongoing requests); other platforms wait for network idle.
                wait_strategy = 'load' if platform in ('gemini', 'doubao') else 'networkidle'
                page.goto(url, wait_until=wait_strategy, timeout=timeout)

                # Best-effort wait for the main selector (works for ChatGPT).
                try:
                    page.wait_for_selector('main', timeout=10000)
                except PlaywrightTimeoutError:
                    # Some platforms may not have 'main' element, continue anyway
                    pass

                # Additional wait for dynamic content based on platform
                page.wait_for_timeout(wait_time)

                # Extract plain text from body
                content = page.inner_text('body')
            finally:
                # Close the browser even when navigation or extraction raises.
                browser.close()

        return content.strip()

    except PlaywrightTimeoutError as e:
        raise PlaywrightTimeoutError(
            f"Failed to load page within {timeout}ms. "
            "Check your network connection and URL validity."
        ) from e
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
if __name__ == "__main__":
    # Manual smoke test: pass a share URL as the first command-line argument.
    # With no argument the script does nothing.
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        url = cli_args[0]
        print(f"Extracting from: {url}")
        content = extract_from_url(url)
        print(f"Extracted {len(content)} characters")
        # Show only the first 500 characters as a preview.
        print(content[:500])
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
You are a professional knowledge document editor.
|
|
2
|
+
|
|
3
|
+
Input: Raw text from an AI chat conversation
|
|
4
|
+
Output: Structured Markdown document
|
|
5
|
+
|
|
6
|
+
Requirements:
|
|
7
|
+
1. Identify the main topic and generate a concise document title
|
|
8
|
+
2. Extract technical tags (e.g., [Python, API, Web]), limit to 3-5 tags
|
|
9
|
+
3. Write a summary (2-3 sentences covering core content)
|
|
10
|
+
4. Reorganize the conversation into knowledge sections with logical headings (## and ###)
|
|
11
|
+
5. Filter out conversational filler ("OK", "let me think", "thanks", etc.)
|
|
12
|
+
6. Extract code blocks and present them separately in a "Code Examples" section
|
|
13
|
+
7. Do NOT preserve chat turn format (User/Assistant) - rewrite as flowing explanatory content
|
|
14
|
+
8. Identify key topics and list them as bullet points
|
|
15
|
+
|
|
16
|
+
Output format (strictly follow):
|
|
17
|
+
---
|
|
18
|
+
tags: [tag1, tag2, tag3]
|
|
19
|
+
date: YYYY-MM-DD
|
|
20
|
+
source: [original URL or filename]
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
# Document Title
|
|
24
|
+
|
|
25
|
+
## Summary
|
|
26
|
+
[2-3 sentences covering core content]
|
|
27
|
+
|
|
28
|
+
## Key Topics
|
|
29
|
+
- Topic 1
|
|
30
|
+
- Topic 2
|
|
31
|
+
- Topic 3
|
|
32
|
+
|
|
33
|
+
## [Knowledge Section Title 1]
|
|
34
|
+
[Reorganized explanatory content, not conversational format]
|
|
35
|
+
|
|
36
|
+
### [Subsection Title]
|
|
37
|
+
[Detailed content]
|
|
38
|
+
|
|
39
|
+
## [Knowledge Section Title 2]
|
|
40
|
+
[Content...]
|
|
41
|
+
|
|
42
|
+
## Code Examples
|
|
43
|
+
```language
|
|
44
|
+
# Complete code with explanatory comments
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Notes:
|
|
48
|
+
- Titles should be specific and informative, avoid generic titles like "Introduction", "Overview"
|
|
49
|
+
- Content should flow smoothly, like a tutorial or documentation rather than chat logs
|
|
50
|
+
- Code examples should be complete and runnable, with necessary comments
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
你是一个专业的知识文档编辑器。
|
|
2
|
+
|
|
3
|
+
输入:AI 聊天对话的原始文本
|
|
4
|
+
输出:结构化的 Markdown 文档
|
|
5
|
+
|
|
6
|
+
要求:
|
|
7
|
+
1. 识别对话主题,生成简洁的文档标题
|
|
8
|
+
2. 提取技术标签(如 [Python, API, Web]),限制在 3-5 个标签
|
|
9
|
+
3. 写一段摘要(2-3 句话概括核心内容)
|
|
10
|
+
4. 将对话重组为知识章节,使用合理的标题层级(## 和 ###)
|
|
11
|
+
5. 过滤无用的对话废话("好的"、"让我想想"、"谢谢"等)
|
|
12
|
+
6. 提取代码块单独呈现在"代码示例"章节
|
|
13
|
+
7. 不要保留对话轮次格式(User/Assistant),重组为流畅的说明性内容
|
|
14
|
+
8. 识别关键主题,列为项目符号列表
|
|
15
|
+
|
|
16
|
+
输出格式(严格遵循):
|
|
17
|
+
---
|
|
18
|
+
技术标签: [标签1, 标签2, 标签3]
|
|
19
|
+
日期: YYYY-MM-DD
|
|
20
|
+
来源: [原始URL或文件名]
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
# 文档标题
|
|
24
|
+
|
|
25
|
+
## 摘要
|
|
26
|
+
[2-3 句话概括核心内容]
|
|
27
|
+
|
|
28
|
+
## 关键主题
|
|
29
|
+
- 主题 1
|
|
30
|
+
- 主题 2
|
|
31
|
+
- 主题 3
|
|
32
|
+
|
|
33
|
+
## [知识点章节标题 1]
|
|
34
|
+
[重组后的说明性内容,非对话格式]
|
|
35
|
+
|
|
36
|
+
### [子章节标题]
|
|
37
|
+
[详细内容]
|
|
38
|
+
|
|
39
|
+
## [知识点章节标题 2]
|
|
40
|
+
[内容...]
|
|
41
|
+
|
|
42
|
+
## 代码示例
|
|
43
|
+
```语言
|
|
44
|
+
# 提取的完整代码,带注释说明
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
注意:
|
|
48
|
+
- 标题要具体且有信息量,避免"介绍"、"概述"等泛泛标题
|
|
49
|
+
- 内容要连贯流畅,像教程或文档而非聊天记录
|
|
50
|
+
- 代码示例要完整可运行,添加必要注释
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aichat2md
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Convert AI chat conversations to structured Markdown
|
|
5
5
|
Author: PlaceNameDay
|
|
6
6
|
License: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/
|
|
8
|
-
Project-URL: Repository, https://github.com/
|
|
9
|
-
Project-URL: Issues, https://github.com/
|
|
7
|
+
Project-URL: Homepage, https://github.com/placenameday/aichat2md
|
|
8
|
+
Project-URL: Repository, https://github.com/placenameday/aichat2md
|
|
9
|
+
Project-URL: Issues, https://github.com/placenameday/aichat2md/issues
|
|
10
10
|
Keywords: chatgpt,claude,markdown,ai,converter,deepseek
|
|
11
11
|
Classifier: Development Status :: 4 - Beta
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
@@ -32,7 +32,7 @@ Convert AI chat conversations to structured Markdown documents.
|
|
|
32
32
|
|
|
33
33
|
## Features
|
|
34
34
|
|
|
35
|
-
- 🌐 **Extract from URLs** - ChatGPT share links (with JS rendering via Playwright)
|
|
35
|
+
- 🌐 **Extract from URLs** - ChatGPT, Gemini, Doubao share links (with JS rendering via Playwright)
|
|
36
36
|
- 📄 **Extract from webarchive** - Safari .webarchive files (offline mode)
|
|
37
37
|
- 🤖 **Multiple AI backends** - DeepSeek, OpenAI, Groq, or any OpenAI-compatible API
|
|
38
38
|
- 🌍 **Bilingual support** - English/Chinese prompts
|
|
@@ -55,6 +55,30 @@ aichat2md https://chatgpt.com/share/xxx
|
|
|
55
55
|
aichat2md ~/Downloads/chat.webarchive
|
|
56
56
|
```
|
|
57
57
|
|
|
58
|
+
## Supported Platforms
|
|
59
|
+
|
|
60
|
+
- **ChatGPT** - chatgpt.com share links
|
|
61
|
+
- **Gemini** - gemini.google.com or g.co share links
|
|
62
|
+
- **Doubao (豆包)** - doubao.com share links
|
|
63
|
+
- **Webarchive** - Safari exported .webarchive files (any platform)
|
|
64
|
+
|
|
65
|
+
### Usage Examples
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# ChatGPT
|
|
69
|
+
aichat2md https://chatgpt.com/share/xxx
|
|
70
|
+
|
|
71
|
+
# Gemini (supports both long and short URLs)
|
|
72
|
+
aichat2md https://gemini.google.com/share/xxx
|
|
73
|
+
aichat2md https://g.co/gemini/share/xxx
|
|
74
|
+
|
|
75
|
+
# Doubao
|
|
76
|
+
aichat2md https://www.doubao.com/thread/xxx
|
|
77
|
+
|
|
78
|
+
# Webarchive file
|
|
79
|
+
aichat2md ~/Downloads/conversation.webarchive
|
|
80
|
+
```
|
|
81
|
+
|
|
58
82
|
## Supported AI Backends
|
|
59
83
|
|
|
60
84
|
- **DeepSeek** (default) - Cost-effective, Chinese service
|
|
@@ -15,5 +15,7 @@ aichat2md/extractors/__init__.py
|
|
|
15
15
|
aichat2md/extractors/playwright_extractor.py
|
|
16
16
|
aichat2md/extractors/webarchive_extractor.py
|
|
17
17
|
aichat2md/prompts/__init__.py
|
|
18
|
+
aichat2md/prompts/system_prompt_en.txt
|
|
19
|
+
aichat2md/prompts/system_prompt_zh.txt
|
|
18
20
|
tests/test_cli.py
|
|
19
21
|
tests/test_config.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "aichat2md"
|
|
7
|
-
version = "1.0.0"
|
|
7
|
+
version = "1.1.0"
|
|
8
8
|
description = "Convert AI chat conversations to structured Markdown"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -34,6 +34,9 @@ dependencies = [
|
|
|
34
34
|
aichat2md = "aichat2md.cli:main"
|
|
35
35
|
|
|
36
36
|
[project.urls]
|
|
37
|
-
Homepage = "https://github.com/
|
|
38
|
-
Repository = "https://github.com/
|
|
39
|
-
Issues = "https://github.com/
|
|
37
|
+
Homepage = "https://github.com/placenameday/aichat2md"
|
|
38
|
+
Repository = "https://github.com/placenameday/aichat2md"
|
|
39
|
+
Issues = "https://github.com/placenameday/aichat2md/issues"
|
|
40
|
+
|
|
41
|
+
[tool.setuptools.package-data]
|
|
42
|
+
aichat2md = ["prompts/*.txt"]
|
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
"""Extract content from ChatGPT share URLs using Playwright."""
|
|
2
|
-
|
|
3
|
-
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def extract_from_url(url: str, timeout: int = 30000) -> str:
    """
    Extract text content from ChatGPT share URL.

    Launches headless Chromium through Playwright, waits for the network to
    go idle and for a ``main`` element, then returns the visible text of
    ``<body>``.

    Args:
        url: ChatGPT share URL (e.g., https://chatgpt.com/share/...); must
            start with ``http://`` or ``https://``
        timeout: Page load timeout in milliseconds

    Returns:
        Extracted plain text content, stripped of surrounding whitespace

    Raises:
        PlaywrightTimeoutError: If page fails to load
        ValueError: If URL is invalid
    """
    # A bare 'http' prefix check would also accept e.g. 'httpfoo://...'.
    if not url.startswith(('http://', 'https://')):
        raise ValueError(f"Invalid URL: {url}")

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Navigate and wait for network idle
                page.goto(url, wait_until='networkidle', timeout=timeout)

                # ChatGPT share pages typically have conversation in main content area
                page.wait_for_selector('main', timeout=10000)

                # Extract plain text from body
                content = page.inner_text('body')
            finally:
                # Close the browser even when navigation or extraction raises.
                browser.close()

        return content.strip()

    except PlaywrightTimeoutError as e:
        raise PlaywrightTimeoutError(
            f"Failed to load page within {timeout}ms. "
            "Check your network connection and URL validity."
        ) from e
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
if __name__ == "__main__":
    # Manual smoke test: the first command-line argument is the share URL.
    # Nothing happens when no argument is supplied.
    import sys

    if sys.argv[1:]:
        url = sys.argv[1]
        print(f"Extracting from: {url}")
        content = extract_from_url(url)
        print(f"Extracted {len(content)} characters")
        # Preview limited to the first 500 characters.
        print(content[:500])
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|