aichat2md 1.0.1__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aichat2md-1.0.1 → aichat2md-1.1.0}/PKG-INFO +26 -2
- {aichat2md-1.0.1 → aichat2md-1.1.0}/README.md +25 -1
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md/__init__.py +1 -1
- aichat2md-1.1.0/aichat2md/extractors/playwright_extractor.py +108 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md.egg-info/PKG-INFO +26 -2
- {aichat2md-1.0.1 → aichat2md-1.1.0}/pyproject.toml +1 -1
- aichat2md-1.0.1/aichat2md/extractors/playwright_extractor.py +0 -58
- {aichat2md-1.0.1 → aichat2md-1.1.0}/LICENSE +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md/cli.py +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md/config.py +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md/extractors/__init__.py +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md/extractors/webarchive_extractor.py +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md/prompts/__init__.py +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md/prompts/system_prompt_en.txt +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md/prompts/system_prompt_zh.txt +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md/structurizer.py +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md.egg-info/SOURCES.txt +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md.egg-info/dependency_links.txt +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md.egg-info/entry_points.txt +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md.egg-info/requires.txt +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/aichat2md.egg-info/top_level.txt +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/setup.cfg +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/tests/test_cli.py +0 -0
- {aichat2md-1.0.1 → aichat2md-1.1.0}/tests/test_config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aichat2md
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Convert AI chat conversations to structured Markdown
|
|
5
5
|
Author: PlaceNameDay
|
|
6
6
|
License: MIT
|
|
@@ -32,7 +32,7 @@ Convert AI chat conversations to structured Markdown documents.
|
|
|
32
32
|
|
|
33
33
|
## Features
|
|
34
34
|
|
|
35
|
-
- 🌐 **Extract from URLs** - ChatGPT share links (with JS rendering via Playwright)
|
|
35
|
+
- 🌐 **Extract from URLs** - ChatGPT, Gemini, Doubao share links (with JS rendering via Playwright)
|
|
36
36
|
- 📄 **Extract from webarchive** - Safari .webarchive files (offline mode)
|
|
37
37
|
- 🤖 **Multiple AI backends** - DeepSeek, OpenAI, Groq, or any OpenAI-compatible API
|
|
38
38
|
- 🌍 **Bilingual support** - English/Chinese prompts
|
|
@@ -55,6 +55,30 @@ aichat2md https://chatgpt.com/share/xxx
|
|
|
55
55
|
aichat2md ~/Downloads/chat.webarchive
|
|
56
56
|
```
|
|
57
57
|
|
|
58
|
+
## Supported Platforms
|
|
59
|
+
|
|
60
|
+
- **ChatGPT** - chatgpt.com share links
|
|
61
|
+
- **Gemini** - gemini.google.com or g.co share links
|
|
62
|
+
- **Doubao (豆包)** - doubao.com share links
|
|
63
|
+
- **Webarchive** - Safari exported .webarchive files (any platform)
|
|
64
|
+
|
|
65
|
+
### Usage Examples
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# ChatGPT
|
|
69
|
+
aichat2md https://chatgpt.com/share/xxx
|
|
70
|
+
|
|
71
|
+
# Gemini (supports both long and short URLs)
|
|
72
|
+
aichat2md https://gemini.google.com/share/xxx
|
|
73
|
+
aichat2md https://g.co/gemini/share/xxx
|
|
74
|
+
|
|
75
|
+
# Doubao
|
|
76
|
+
aichat2md https://www.doubao.com/thread/xxx
|
|
77
|
+
|
|
78
|
+
# Webarchive file
|
|
79
|
+
aichat2md ~/Downloads/conversation.webarchive
|
|
80
|
+
```
|
|
81
|
+
|
|
58
82
|
## Supported AI Backends
|
|
59
83
|
|
|
60
84
|
- **DeepSeek** (default) - Cost-effective, Chinese service
|
|
@@ -4,7 +4,7 @@ Convert AI chat conversations to structured Markdown documents.
|
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
|
-
- 🌐 **Extract from URLs** - ChatGPT share links (with JS rendering via Playwright)
|
|
7
|
+
- 🌐 **Extract from URLs** - ChatGPT, Gemini, Doubao share links (with JS rendering via Playwright)
|
|
8
8
|
- 📄 **Extract from webarchive** - Safari .webarchive files (offline mode)
|
|
9
9
|
- 🤖 **Multiple AI backends** - DeepSeek, OpenAI, Groq, or any OpenAI-compatible API
|
|
10
10
|
- 🌍 **Bilingual support** - English/Chinese prompts
|
|
@@ -27,6 +27,30 @@ aichat2md https://chatgpt.com/share/xxx
|
|
|
27
27
|
aichat2md ~/Downloads/chat.webarchive
|
|
28
28
|
```
|
|
29
29
|
|
|
30
|
+
## Supported Platforms
|
|
31
|
+
|
|
32
|
+
- **ChatGPT** - chatgpt.com share links
|
|
33
|
+
- **Gemini** - gemini.google.com or g.co share links
|
|
34
|
+
- **Doubao (豆包)** - doubao.com share links
|
|
35
|
+
- **Webarchive** - Safari exported .webarchive files (any platform)
|
|
36
|
+
|
|
37
|
+
### Usage Examples
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# ChatGPT
|
|
41
|
+
aichat2md https://chatgpt.com/share/xxx
|
|
42
|
+
|
|
43
|
+
# Gemini (supports both long and short URLs)
|
|
44
|
+
aichat2md https://gemini.google.com/share/xxx
|
|
45
|
+
aichat2md https://g.co/gemini/share/xxx
|
|
46
|
+
|
|
47
|
+
# Doubao
|
|
48
|
+
aichat2md https://www.doubao.com/thread/xxx
|
|
49
|
+
|
|
50
|
+
# Webarchive file
|
|
51
|
+
aichat2md ~/Downloads/conversation.webarchive
|
|
52
|
+
```
|
|
53
|
+
|
|
30
54
|
## Supported AI Backends
|
|
31
55
|
|
|
32
56
|
- **DeepSeek** (default) - Cost-effective, Chinese service
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Extract content from AI chat share URLs using Playwright."""
|
|
2
|
+
|
|
3
|
+
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _detect_platform(url: str) -> str:
|
|
7
|
+
"""
|
|
8
|
+
Detect platform from URL.
|
|
9
|
+
|
|
10
|
+
Args:
|
|
11
|
+
url: Share URL
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
Platform name: 'doubao', 'gemini', or 'default'
|
|
15
|
+
"""
|
|
16
|
+
url_lower = url.lower()
|
|
17
|
+
if 'doubao.com' in url_lower:
|
|
18
|
+
return 'doubao'
|
|
19
|
+
elif 'gemini.google.com' in url_lower or 'g.co' in url_lower:
|
|
20
|
+
return 'gemini'
|
|
21
|
+
else:
|
|
22
|
+
return 'default'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_wait_time(platform: str) -> int:
|
|
26
|
+
"""
|
|
27
|
+
Get wait time in milliseconds for platform.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
platform: Platform name from _detect_platform
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
Wait time in milliseconds
|
|
34
|
+
"""
|
|
35
|
+
wait_times = {
|
|
36
|
+
'doubao': 3000,
|
|
37
|
+
'gemini': 5000,
|
|
38
|
+
'default': 2000
|
|
39
|
+
}
|
|
40
|
+
return wait_times.get(platform, 2000)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def extract_from_url(url: str, timeout: int = 30000) -> str:
|
|
44
|
+
"""
|
|
45
|
+
Extract text content from AI chat share URL.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
url: Share URL (ChatGPT, Gemini, Doubao, etc.)
|
|
49
|
+
timeout: Page load timeout in milliseconds
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
Extracted plain text content
|
|
53
|
+
|
|
54
|
+
Raises:
|
|
55
|
+
PlaywrightTimeoutError: If page fails to load
|
|
56
|
+
ValueError: If URL is invalid
|
|
57
|
+
"""
|
|
58
|
+
if not url.startswith('http'):
|
|
59
|
+
raise ValueError(f"Invalid URL: {url}")
|
|
60
|
+
|
|
61
|
+
# Detect platform and get corresponding wait time
|
|
62
|
+
platform = _detect_platform(url)
|
|
63
|
+
wait_time = _get_wait_time(platform)
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
with sync_playwright() as p:
|
|
67
|
+
browser = p.chromium.launch(headless=True)
|
|
68
|
+
page = browser.new_page()
|
|
69
|
+
|
|
70
|
+
# Navigate with appropriate wait strategy
|
|
71
|
+
# Use 'load' for Gemini/Doubao (networkidle may timeout due to ongoing requests)
|
|
72
|
+
wait_strategy = 'load' if platform in ['gemini', 'doubao'] else 'networkidle'
|
|
73
|
+
page.goto(url, wait_until=wait_strategy, timeout=timeout)
|
|
74
|
+
|
|
75
|
+
# Wait for content to load
|
|
76
|
+
# Try to wait for main selector (works for ChatGPT)
|
|
77
|
+
try:
|
|
78
|
+
page.wait_for_selector('main', timeout=10000)
|
|
79
|
+
except PlaywrightTimeoutError:
|
|
80
|
+
# Some platforms may not have 'main' element, continue anyway
|
|
81
|
+
pass
|
|
82
|
+
|
|
83
|
+
# Additional wait for dynamic content based on platform
|
|
84
|
+
page.wait_for_timeout(wait_time)
|
|
85
|
+
|
|
86
|
+
# Extract plain text from body
|
|
87
|
+
content = page.inner_text('body')
|
|
88
|
+
|
|
89
|
+
browser.close()
|
|
90
|
+
|
|
91
|
+
return content.strip()
|
|
92
|
+
|
|
93
|
+
except PlaywrightTimeoutError as e:
|
|
94
|
+
raise PlaywrightTimeoutError(
|
|
95
|
+
f"Failed to load page within {timeout}ms. "
|
|
96
|
+
"Check your network connection and URL validity."
|
|
97
|
+
) from e
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
if __name__ == "__main__":
|
|
101
|
+
# Manual test
|
|
102
|
+
import sys
|
|
103
|
+
if len(sys.argv) > 1:
|
|
104
|
+
url = sys.argv[1]
|
|
105
|
+
print(f"Extracting from: {url}")
|
|
106
|
+
content = extract_from_url(url)
|
|
107
|
+
print(f"Extracted {len(content)} characters")
|
|
108
|
+
print(content[:500])
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aichat2md
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Convert AI chat conversations to structured Markdown
|
|
5
5
|
Author: PlaceNameDay
|
|
6
6
|
License: MIT
|
|
@@ -32,7 +32,7 @@ Convert AI chat conversations to structured Markdown documents.
|
|
|
32
32
|
|
|
33
33
|
## Features
|
|
34
34
|
|
|
35
|
-
- 🌐 **Extract from URLs** - ChatGPT share links (with JS rendering via Playwright)
|
|
35
|
+
- 🌐 **Extract from URLs** - ChatGPT, Gemini, Doubao share links (with JS rendering via Playwright)
|
|
36
36
|
- 📄 **Extract from webarchive** - Safari .webarchive files (offline mode)
|
|
37
37
|
- 🤖 **Multiple AI backends** - DeepSeek, OpenAI, Groq, or any OpenAI-compatible API
|
|
38
38
|
- 🌍 **Bilingual support** - English/Chinese prompts
|
|
@@ -55,6 +55,30 @@ aichat2md https://chatgpt.com/share/xxx
|
|
|
55
55
|
aichat2md ~/Downloads/chat.webarchive
|
|
56
56
|
```
|
|
57
57
|
|
|
58
|
+
## Supported Platforms
|
|
59
|
+
|
|
60
|
+
- **ChatGPT** - chatgpt.com share links
|
|
61
|
+
- **Gemini** - gemini.google.com or g.co share links
|
|
62
|
+
- **Doubao (豆包)** - doubao.com share links
|
|
63
|
+
- **Webarchive** - Safari exported .webarchive files (any platform)
|
|
64
|
+
|
|
65
|
+
### Usage Examples
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# ChatGPT
|
|
69
|
+
aichat2md https://chatgpt.com/share/xxx
|
|
70
|
+
|
|
71
|
+
# Gemini (supports both long and short URLs)
|
|
72
|
+
aichat2md https://gemini.google.com/share/xxx
|
|
73
|
+
aichat2md https://g.co/gemini/share/xxx
|
|
74
|
+
|
|
75
|
+
# Doubao
|
|
76
|
+
aichat2md https://www.doubao.com/thread/xxx
|
|
77
|
+
|
|
78
|
+
# Webarchive file
|
|
79
|
+
aichat2md ~/Downloads/conversation.webarchive
|
|
80
|
+
```
|
|
81
|
+
|
|
58
82
|
## Supported AI Backends
|
|
59
83
|
|
|
60
84
|
- **DeepSeek** (default) - Cost-effective, Chinese service
|
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
"""Extract content from ChatGPT share URLs using Playwright."""
|
|
2
|
-
|
|
3
|
-
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def extract_from_url(url: str, timeout: int = 30000) -> str:
|
|
7
|
-
"""
|
|
8
|
-
Extract text content from ChatGPT share URL.
|
|
9
|
-
|
|
10
|
-
Args:
|
|
11
|
-
url: ChatGPT share URL (e.g., https://chatgpt.com/share/...)
|
|
12
|
-
timeout: Page load timeout in milliseconds
|
|
13
|
-
|
|
14
|
-
Returns:
|
|
15
|
-
Extracted plain text content
|
|
16
|
-
|
|
17
|
-
Raises:
|
|
18
|
-
PlaywrightTimeoutError: If page fails to load
|
|
19
|
-
ValueError: If URL is invalid
|
|
20
|
-
"""
|
|
21
|
-
if not url.startswith('http'):
|
|
22
|
-
raise ValueError(f"Invalid URL: {url}")
|
|
23
|
-
|
|
24
|
-
try:
|
|
25
|
-
with sync_playwright() as p:
|
|
26
|
-
browser = p.chromium.launch(headless=True)
|
|
27
|
-
page = browser.new_page()
|
|
28
|
-
|
|
29
|
-
# Navigate and wait for network idle
|
|
30
|
-
page.goto(url, wait_until='networkidle', timeout=timeout)
|
|
31
|
-
|
|
32
|
-
# Wait for conversation content to load
|
|
33
|
-
# ChatGPT share pages typically have conversation in main content area
|
|
34
|
-
page.wait_for_selector('main', timeout=10000)
|
|
35
|
-
|
|
36
|
-
# Extract plain text from body
|
|
37
|
-
content = page.inner_text('body')
|
|
38
|
-
|
|
39
|
-
browser.close()
|
|
40
|
-
|
|
41
|
-
return content.strip()
|
|
42
|
-
|
|
43
|
-
except PlaywrightTimeoutError as e:
|
|
44
|
-
raise PlaywrightTimeoutError(
|
|
45
|
-
f"Failed to load page within {timeout}ms. "
|
|
46
|
-
"Check your network connection and URL validity."
|
|
47
|
-
) from e
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
if __name__ == "__main__":
|
|
51
|
-
# Manual test
|
|
52
|
-
import sys
|
|
53
|
-
if len(sys.argv) > 1:
|
|
54
|
-
url = sys.argv[1]
|
|
55
|
-
print(f"Extracting from: {url}")
|
|
56
|
-
content = extract_from_url(url)
|
|
57
|
-
print(f"Extracted {len(content)} characters")
|
|
58
|
-
print(content[:500])
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|