@tikomni/skills 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -1
- package/README.zh-CN.md +50 -1
- package/package.json +1 -1
- package/skills/social-media-crawl/SKILL.md +48 -30
- package/skills/social-media-crawl/agents/openai.yaml +2 -3
- package/skills/social-media-crawl/references/contracts/output-envelope.md +5 -6
- package/skills/social-media-crawl/references/contracts/work-fact-card-fields.md +8 -8
- package/skills/social-media-crawl/references/guides/generic-mcp-objects.md +22 -12
- package/skills/social-media-crawl/references/mcp-usage-contract.md +23 -13
- package/skills/social-media-crawl/references/pipelines/douyin-creator-home.md +4 -5
- package/skills/social-media-crawl/references/pipelines/douyin-single-work.md +4 -5
- package/skills/social-media-crawl/references/pipelines/xiaohongshu-creator-home.md +4 -5
- package/skills/social-media-crawl/references/pipelines/xiaohongshu-single-work.md +4 -5
- package/skills/social-media-crawl/references/service-guides/u2-u3-mandatory-fallback.md +10 -11
- package/skills/social-media-crawl/scripts/core/mcp_dispatch.py +3 -0
- package/skills/social-media-crawl/scripts/core/tikomni_common.py +7 -0
- package/skills/social-media-crawl/scripts/core/u3_fallback.py +146 -28
package/README.md
CHANGED
|
@@ -79,7 +79,56 @@ The npm package currently exposes:
|
|
|
79
79
|
|
|
80
80
|
### 2. Install skills
|
|
81
81
|
|
|
82
|
-
|
|
82
|
+
Before installation, prepare:
|
|
83
|
+
|
|
84
|
+
- `Node.js >= 18`
|
|
85
|
+
- `Python 3`
|
|
86
|
+
|
|
87
|
+
Common setup commands:
|
|
88
|
+
|
|
89
|
+
macOS (Homebrew):
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# If Homebrew is not installed yet:
|
|
93
|
+
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
|
|
94
|
+
|
|
95
|
+
brew install node
|
|
96
|
+
brew install python
|
|
97
|
+
|
|
98
|
+
node -v
|
|
99
|
+
npm -v
|
|
100
|
+
python3 --version
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Linux (Ubuntu / Debian; also works in WSL):
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
sudo apt update
|
|
107
|
+
sudo apt install -y curl python3 python3-pip
|
|
108
|
+
|
|
109
|
+
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.3/install.sh | bash
|
|
110
|
+
export NVM_DIR="$HOME/.nvm"
|
|
111
|
+
[ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"
|
|
112
|
+
|
|
113
|
+
nvm install --lts
|
|
114
|
+
|
|
115
|
+
node -v
|
|
116
|
+
npm -v
|
|
117
|
+
python3 --version
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Windows (PowerShell):
|
|
121
|
+
|
|
122
|
+
```powershell
|
|
123
|
+
winget install -e --id OpenJS.NodeJS.LTS
|
|
124
|
+
winget install -e --id Python.Python.3
|
|
125
|
+
|
|
126
|
+
node -v
|
|
127
|
+
npm -v
|
|
128
|
+
python --version
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
After that, install directly from npm.
|
|
83
132
|
|
|
84
133
|
First, list the currently available skills:
|
|
85
134
|
|
package/README.zh-CN.md
CHANGED
|
@@ -79,7 +79,56 @@
|
|
|
79
79
|
|
|
80
80
|
### 2. 安装 Skills
|
|
81
81
|
|
|
82
|
-
|
|
82
|
+
安装前请先准备:
|
|
83
|
+
|
|
84
|
+
- `Node.js >= 18`
|
|
85
|
+
- `Python 3`
|
|
86
|
+
|
|
87
|
+
常见安装命令:
|
|
88
|
+
|
|
89
|
+
macOS(Homebrew):
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# 如未安装 Homebrew,先执行:
|
|
93
|
+
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
|
|
94
|
+
|
|
95
|
+
brew install node
|
|
96
|
+
brew install python
|
|
97
|
+
|
|
98
|
+
node -v
|
|
99
|
+
npm -v
|
|
100
|
+
python3 --version
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Linux(Ubuntu / Debian;WSL 可同样使用):
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
sudo apt update
|
|
107
|
+
sudo apt install -y curl python3 python3-pip
|
|
108
|
+
|
|
109
|
+
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.3/install.sh | bash
|
|
110
|
+
export NVM_DIR="$HOME/.nvm"
|
|
111
|
+
[ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"
|
|
112
|
+
|
|
113
|
+
nvm install --lts
|
|
114
|
+
|
|
115
|
+
node -v
|
|
116
|
+
npm -v
|
|
117
|
+
python3 --version
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Windows(PowerShell):
|
|
121
|
+
|
|
122
|
+
```powershell
|
|
123
|
+
winget install -e --id OpenJS.NodeJS.LTS
|
|
124
|
+
winget install -e --id Python.Python.3
|
|
125
|
+
|
|
126
|
+
node -v
|
|
127
|
+
npm -v
|
|
128
|
+
python --version
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
准备完成后,使用 npm 直接安装。
|
|
83
132
|
|
|
84
133
|
先查看当前可安装的 Skills:
|
|
85
134
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tikomni/skills",
|
|
3
|
-
"version": "0.1.8",
|
|
3
|
+
"version": "0.1.10",
|
|
4
4
|
"description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
|
|
@@ -7,48 +7,66 @@ description: Use this skill when the user asks about social media links, posts,
|
|
|
7
7
|
|
|
8
8
|
## When To Use
|
|
9
9
|
|
|
10
|
-
-
|
|
11
|
-
-
|
|
12
|
-
-
|
|
10
|
+
- Use this skill when the user needs any social-media object such as a work link, post, thread, long-form post, creator homepage, comment section, search result, ranking page, livestream room, or product page.
|
|
11
|
+
- Use this skill when the input is a share short link, work URL, post URL, thread URL, homepage URL, platform ID, keyword, or entry page.
|
|
12
|
+
- Route social-media retrieval requests through this skill first.
|
|
13
|
+
|
|
14
|
+
## Supported Platforms
|
|
15
|
+
|
|
16
|
+
- The currently supported platforms include Douyin, Xiaohongshu, Kuaishou, Bilibili, Weibo, TikTok, YouTube, Instagram, Threads, Twitter/X, Reddit, LinkedIn, WeChat Channels, WeChat Official Accounts, Toutiao, Xigua, Zhihu, Lemon8, and Pipixia.
|
|
17
|
+
- Only four fixed pipelines are currently frozen: Douyin single work, Douyin creator home, Xiaohongshu single work, and Xiaohongshu creator home.
|
|
18
|
+
- All other supported platform and object combinations should use the generic MCP workflow inside this skill.
|
|
19
|
+
- See the official documentation for the full platform catalog and update policy: https://docs.tikomni.com
|
|
13
20
|
|
|
14
21
|
## What To Help With
|
|
15
22
|
|
|
16
|
-
-
|
|
17
|
-
-
|
|
18
|
-
-
|
|
19
|
-
-
|
|
23
|
+
- Detect the platform and object type.
|
|
24
|
+
- Prefer single work, single post, thread, creator homepage, and content collection tasks.
|
|
25
|
+
- Collect factual fields, platform text, subtitles, or transcript text.
|
|
26
|
+
- Return structured JSON and write fact cards when required.
|
|
20
27
|
|
|
21
28
|
## Preferred Routing
|
|
22
29
|
|
|
23
|
-
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
30
|
+
- Route all social-media objects into this skill first; do not jump straight to browser/CDP only because a fixed pipeline does not match.
|
|
31
|
+
- If the object is one of the following four cases, prefer the fixed pipeline:
|
|
32
|
+
1. Douyin single work
|
|
33
|
+
2. Xiaohongshu single work
|
|
34
|
+
3. Douyin creator home
|
|
35
|
+
4. Xiaohongshu creator home
|
|
36
|
+
- If no fixed pipeline matches, stay inside this skill and use the generic MCP path: `tools/list -> catalog.search -> endpoint.describe -> api.call`.
|
|
37
|
+
- Use browser/CDP only when both the fixed pipeline and the generic MCP path are unavailable, or when the task explicitly requires page-level interaction that the API cannot satisfy. Record the reason in the result.
|
|
29
38
|
|
|
30
39
|
## Working Style
|
|
31
40
|
|
|
32
|
-
-
|
|
33
|
-
-
|
|
34
|
-
-
|
|
35
|
-
-
|
|
36
|
-
-
|
|
41
|
+
- Detect the platform and object type first, then choose between a fixed pipeline and the generic MCP path.
|
|
42
|
+
- Treat browser/CDP as a fallback, not the default option.
|
|
43
|
+
- Prioritize factual fields and avoid subjective analysis.
|
|
44
|
+
- Prefer native subtitles for video text; use ASR only when subtitles are unavailable.
|
|
45
|
+
- Default to structured JSON plus Markdown and persist outputs proactively.
|
|
46
|
+
- When a fixed pipeline matches, execute it and persist outputs by default.
|
|
47
|
+
|
|
48
|
+
## Typical Examples
|
|
49
|
+
|
|
50
|
+
- Extract the body text and factual fields from a single X/Twitter post.
|
|
51
|
+
- Collect TikTok comment data and normalize it into a unified structured result.
|
|
52
|
+
- Extract a creator profile and content collection from a homepage.
|
|
53
|
+
- Collect the top N comments from a comment section.
|
|
54
|
+
- Collect search results, ranking pages, livestream rooms, or product pages.
|
|
37
55
|
|
|
38
56
|
## Workflow
|
|
39
57
|
|
|
40
|
-
1.
|
|
41
|
-
2.
|
|
42
|
-
3.
|
|
43
|
-
4.
|
|
44
|
-
5.
|
|
45
|
-
6.
|
|
58
|
+
1. Detect the platform and object type.
|
|
59
|
+
2. If a fixed pipeline matches, run the fixed script directly.
|
|
60
|
+
3. If no fixed pipeline matches, call `tools/list` first, then choose the smallest MCP toolchain through `catalog.search -> endpoint.describe -> api.call`.
|
|
61
|
+
4. If video text is required, run `u2.submit -> u2.query`; if the task is still `pending` after 60 seconds of polling, enter the U3 fallback path.
|
|
62
|
+
5. Normalize the result into a structured payload.
|
|
63
|
+
6. If the object is a work, write `work_fact_card`.
|
|
46
64
|
|
|
47
65
|
## References
|
|
48
66
|
|
|
49
|
-
- MCP
|
|
50
|
-
-
|
|
51
|
-
-
|
|
52
|
-
-
|
|
53
|
-
- U2/U3
|
|
54
|
-
-
|
|
67
|
+
- MCP usage contract: `references/mcp-usage-contract.md`
|
|
68
|
+
- Output envelope: `references/contracts/output-envelope.md`
|
|
69
|
+
- Work fact card fields: `references/contracts/work-fact-card-fields.md`
|
|
70
|
+
- Generic MCP object guide: `references/guides/generic-mcp-objects.md`
|
|
71
|
+
- U2/U3 rules: `references/service-guides/u2-u3-mandatory-fallback.md`
|
|
72
|
+
- Fixed pipeline guides: `references/pipelines/`
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
interface:
|
|
2
2
|
display_name: "Social Media Crawl"
|
|
3
|
-
short_description: "
|
|
4
|
-
default_prompt: "Use $social-media-crawl to fetch, transcribe, normalize, and archive
|
|
5
|
-
|
|
3
|
+
short_description: "Cross-platform structured social-media retrieval and archival"
|
|
4
|
+
default_prompt: "Use $social-media-crawl to fetch, transcribe, normalize, and archive social-media objects into structured JSON and work fact cards. Supported platforms include Douyin, Xiaohongshu, Kuaishou, Bilibili, Weibo, TikTok, YouTube, Instagram, Threads, Twitter/X, Reddit, LinkedIn, WeChat Channels, WeChat Official Accounts, Toutiao, Xigua, Zhihu, Lemon8, and Pipixia. Use fixed pipelines for Douyin/Xiaohongshu single-work and creator-home requests; use the MCP generic flow for other supported platforms and objects; use browser fallback only when the fixed pipelines and MCP path are insufficient."
|
|
@@ -14,9 +14,8 @@
|
|
|
14
14
|
|
|
15
15
|
## Semantics
|
|
16
16
|
|
|
17
|
-
- `normalized`
|
|
18
|
-
- `completeness`
|
|
19
|
-
- `missing_fields`
|
|
20
|
-
- `error_reason`
|
|
21
|
-
- `extract_trace`
|
|
22
|
-
|
|
17
|
+
- `normalized` stores structured facts only, not analytical inference.
|
|
18
|
+
- `completeness` allows `complete`, `partial`, and `incomplete`.
|
|
19
|
+
- `missing_fields` is the list of missing fields.
|
|
20
|
+
- `error_reason` may be an empty string or `null` on success.
|
|
21
|
+
- `extract_trace` records the fixed-pipeline or MCP dispatch steps. If the flow ends in browser/CDP fallback, it must also record the earlier MCP attempts and the fallback reason.
|
|
@@ -36,15 +36,15 @@
|
|
|
36
36
|
|
|
37
37
|
## Field Rules
|
|
38
38
|
|
|
39
|
-
- `author`
|
|
40
|
-
- Markdown
|
|
41
|
-
-
|
|
42
|
-
- `primary_text`
|
|
43
|
-
- `play_count`
|
|
44
|
-
-
|
|
39
|
+
- `author` is the display name, not an object.
|
|
40
|
+
- Fact fields for the Markdown card go into frontmatter. Do not emit a separate `## Facts` section.
|
|
41
|
+
- The work-library directory writes only the Markdown card and no extra `.json` sidecar in the same directory.
|
|
42
|
+
- `primary_text` is the text that is best suited for reading and indexing in the current task.
|
|
43
|
+
- `play_count` may be `null`. Leave it empty when missing, and keep `0` only when the platform explicitly returns `0`.
|
|
44
|
+
- Preferred order for video works:
|
|
45
45
|
- `subtitle_raw`
|
|
46
46
|
- `asr_clean`
|
|
47
47
|
- `caption_raw`
|
|
48
|
-
-
|
|
48
|
+
- Preferred order for text works:
|
|
49
49
|
- `caption_raw`
|
|
50
|
-
-
|
|
50
|
+
- Do not add analytical fields.
|
|
@@ -1,20 +1,31 @@
|
|
|
1
1
|
# Generic MCP Objects Guide
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
The following objects do not freeze a fine-grained schema in the first release:
|
|
4
4
|
|
|
5
|
-
-
|
|
6
|
-
-
|
|
7
|
-
-
|
|
8
|
-
-
|
|
9
|
-
-
|
|
5
|
+
- Comment threads
|
|
6
|
+
- Search results
|
|
7
|
+
- Ranking pages
|
|
8
|
+
- Livestream rooms
|
|
9
|
+
- Product pages
|
|
10
|
+
|
|
11
|
+
In addition, every platform and object combination that does not match a fixed pipeline falls under this guide, for example:
|
|
12
|
+
|
|
13
|
+
- A single X/Twitter post
|
|
14
|
+
- An X/Twitter thread
|
|
15
|
+
- An X/Twitter long-form post
|
|
16
|
+
- An X/Twitter creator homepage
|
|
17
|
+
- The top N comments from a comment section
|
|
10
18
|
|
|
11
19
|
## Rules
|
|
12
20
|
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
16
|
-
-
|
|
17
|
-
-
|
|
21
|
+
- Route these objects through the generic MCP workflow inside this skill.
|
|
22
|
+
- The platform is not limited to the Douyin and Xiaohongshu cases covered by fixed pipelines. If the platform is discoverable in the MCP catalog, try this workflow first.
|
|
23
|
+
- Detect the object first, then use `catalog.search` and `endpoint.describe` to choose the smallest toolchain.
|
|
24
|
+
- Do not jump to browser/CDP only because the platform is not Douyin or Xiaohongshu.
|
|
25
|
+
- Use browser/CDP only when the generic MCP path is unavailable, or when the task explicitly requires page-level interaction that the API cannot satisfy. Explain the reason in the output.
|
|
26
|
+
- The output must satisfy the unified envelope.
|
|
27
|
+
- No card write is required in the first release.
|
|
28
|
+
- Do not fabricate fields only to satisfy schema completeness.
|
|
18
29
|
|
|
19
30
|
## Minimum Deliverable
|
|
20
31
|
|
|
@@ -27,4 +38,3 @@
|
|
|
27
38
|
- `missing_fields`
|
|
28
39
|
- `error_reason`
|
|
29
40
|
- `extract_trace`
|
|
30
|
-
|
|
@@ -1,30 +1,40 @@
|
|
|
1
1
|
# MCP Usage Contract
|
|
2
2
|
|
|
3
|
+
## Scope
|
|
4
|
+
|
|
5
|
+
- This contract applies to every social-media object that does not match a fixed pipeline, not only Douyin and Xiaohongshu.
|
|
6
|
+
- The currently supported platforms include Douyin, Xiaohongshu, Kuaishou, Bilibili, Weibo, TikTok, YouTube, Instagram, Threads, Twitter/X, Reddit, LinkedIn, WeChat Channels, Official Accounts, Toutiao, Xigua, Zhihu, Lemon8, and Pipixia.
|
|
7
|
+
- Typical objects include X/Twitter posts, threads, long-form posts, creator homepages, comment sections, search results, ranking pages, livestream rooms, and product pages.
|
|
8
|
+
- Fixed pipelines are frozen only for Douyin and Xiaohongshu single-work and creator-home cases. All other supported platform and object combinations should use the generic MCP path defined here.
|
|
9
|
+
|
|
3
10
|
## Fixed Inputs
|
|
4
11
|
|
|
5
12
|
- MCP URL: `https://mcp.tikomni.com/mcp`
|
|
6
13
|
- Auth: `Authorization: Bearer <TIKOMNI_API_KEY>`
|
|
7
|
-
-
|
|
14
|
+
- Do not repeat the API key inside tool parameters.
|
|
8
15
|
|
|
9
16
|
## Required Tool Order
|
|
10
17
|
|
|
11
|
-
1.
|
|
12
|
-
2.
|
|
13
|
-
3.
|
|
18
|
+
1. Detect the platform and object type.
|
|
19
|
+
2. Decide whether a fixed pipeline matches.
|
|
20
|
+
3. If a fixed pipeline matches, run the fixed script directly and do not enter the generic MCP path.
|
|
21
|
+
4. If no fixed pipeline matches:
|
|
22
|
+
- `tools/list`
|
|
14
23
|
- `catalog.search`
|
|
15
24
|
- `endpoint.describe`
|
|
16
25
|
- `api.call`
|
|
17
|
-
|
|
26
|
+
5. If video text is required:
|
|
18
27
|
- `u2.submit`
|
|
19
28
|
- `u2.query`
|
|
20
|
-
-
|
|
29
|
+
- Enter the U3 fallback path if the task is still `pending` after 60 seconds.
|
|
30
|
+
6. Use browser/CDP only when the generic MCP path is unavailable or clearly insufficient. Do not skip step 4 and jump straight to browser/CDP.
|
|
21
31
|
|
|
22
32
|
## Output Rules
|
|
23
33
|
|
|
24
|
-
-
|
|
25
|
-
-
|
|
26
|
-
-
|
|
27
|
-
-
|
|
28
|
-
-
|
|
29
|
-
-
|
|
30
|
-
|
|
34
|
+
- Keep factual fields separate from derived metadata.
|
|
35
|
+
- The result must include `request_id`.
|
|
36
|
+
- The result must include `completeness`.
|
|
37
|
+
- The result must include `missing_fields`.
|
|
38
|
+
- The result must include `error_reason`.
|
|
39
|
+
- The result must include `extract_trace`.
|
|
40
|
+
- If the flow ends in browser/CDP fallback, `extract_trace` must also include the earlier MCP attempts and the fallback reason.
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# Douyin Creator Home
|
|
2
2
|
|
|
3
|
-
-
|
|
4
|
-
-
|
|
5
|
-
-
|
|
6
|
-
-
|
|
7
|
-
|
|
3
|
+
- Input: a homepage URL or share short link
|
|
4
|
+
- Output: `creator_profile.json`, `work_collection.json`, and multiple `work_fact_card` files
|
|
5
|
+
- Implementation script: `scripts/pipelines/run_douyin_creator_home.py`
|
|
6
|
+
- Required test sample: the frozen Douyin creator-home sample defined by the plan
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# Douyin Single Work
|
|
2
2
|
|
|
3
|
-
-
|
|
4
|
-
-
|
|
5
|
-
-
|
|
6
|
-
-
|
|
7
|
-
|
|
3
|
+
- Input: a work URL or share short link
|
|
4
|
+
- Output: a unified `work_fact_card`
|
|
5
|
+
- Implementation script: `scripts/pipelines/run_douyin_single_work.py`
|
|
6
|
+
- Required test sample: the frozen Douyin work sample defined by the plan
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# Xiaohongshu Creator Home
|
|
2
2
|
|
|
3
|
-
-
|
|
4
|
-
-
|
|
5
|
-
-
|
|
6
|
-
-
|
|
7
|
-
|
|
3
|
+
- Input: a homepage URL or share short link
|
|
4
|
+
- Output: `creator_profile.json`, `work_collection.json`, and multiple `work_fact_card` files
|
|
5
|
+
- Implementation script: `scripts/pipelines/run_xiaohongshu_creator_home.py`
|
|
6
|
+
- Required test sample: the frozen Xiaohongshu creator-home sample defined by the plan
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# Xiaohongshu Single Work
|
|
2
2
|
|
|
3
|
-
-
|
|
4
|
-
-
|
|
5
|
-
-
|
|
6
|
-
-
|
|
7
|
-
|
|
3
|
+
- Input: a work URL or share short link
|
|
4
|
+
- Output: a unified `work_fact_card`
|
|
5
|
+
- Implementation script: `scripts/pipelines/run_xiaohongshu_single_work.py`
|
|
6
|
+
- Required test sample: the frozen Xiaohongshu work sample defined by the plan
|
|
@@ -2,20 +2,19 @@
|
|
|
2
2
|
|
|
3
3
|
## Hard Rule
|
|
4
4
|
|
|
5
|
-
-
|
|
6
|
-
-
|
|
7
|
-
- `u2.submit`
|
|
8
|
-
-
|
|
9
|
-
- U3
|
|
10
|
-
-
|
|
5
|
+
- Prefer native platform subtitles for video text.
|
|
6
|
+
- If native subtitles are unavailable, use U2 first.
|
|
7
|
+
- After `u2.submit` succeeds, poll for at most 60 seconds.
|
|
8
|
+
- If the task is still `pending` after 60 seconds, enter the U3 fallback path.
|
|
9
|
+
- If U3 succeeds, continue and complete the ASR path.
|
|
10
|
+
- If the flow still fails, keep the fact card and mark `completeness=incomplete`.
|
|
11
11
|
|
|
12
12
|
## Trace Requirement
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
Record at least:
|
|
15
15
|
|
|
16
16
|
- U2 submit
|
|
17
17
|
- U2 poll
|
|
18
|
-
-
|
|
19
|
-
- U3
|
|
20
|
-
-
|
|
21
|
-
|
|
18
|
+
- The reason that triggered U3
|
|
19
|
+
- The U3 invocation steps
|
|
20
|
+
- The final text source
|
|
@@ -48,6 +48,9 @@ class McpHttpClient:
|
|
|
48
48
|
"Authorization": f"Bearer {self.api_key}",
|
|
49
49
|
"Content-Type": "application/json",
|
|
50
50
|
"Accept": "application/json, text/event-stream",
|
|
51
|
+
"User-Agent": "OpenClaw-SocialMediaCrawl/0.1",
|
|
52
|
+
"X-Client-Name": "social-media-crawl",
|
|
53
|
+
"X-Client-Version": "0.1.0",
|
|
51
54
|
}
|
|
52
55
|
if self.session_id:
|
|
53
56
|
headers["mcp-session-id"] = self.session_id
|
|
@@ -257,6 +257,13 @@ def _resolve_timeout_retry_backoff_ms() -> int:
|
|
|
257
257
|
return max(0, min(backoff, 5000))
|
|
258
258
|
|
|
259
259
|
|
|
260
|
+
def resolve_timeout_retry_policy() -> Dict[str, int]:
|
|
261
|
+
return {
|
|
262
|
+
"max_retries": _resolve_timeout_retry_max(),
|
|
263
|
+
"backoff_ms": _resolve_timeout_retry_backoff_ms(),
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
|
|
260
267
|
def _wait_rate_limit_slot(qps: float) -> int:
|
|
261
268
|
global _NEXT_ALLOWED_TS
|
|
262
269
|
interval_sec = 1.0 / max(qps, 0.1)
|
|
@@ -5,18 +5,26 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
import mimetypes
|
|
7
7
|
import os
|
|
8
|
+
import socket
|
|
8
9
|
import tempfile
|
|
10
|
+
import time
|
|
9
11
|
import urllib.error
|
|
10
12
|
import urllib.parse
|
|
11
13
|
import urllib.request
|
|
12
14
|
from pathlib import Path
|
|
13
|
-
from typing import Any, Dict, Optional
|
|
15
|
+
from typing import Any, Dict, List, Optional
|
|
14
16
|
|
|
15
|
-
from scripts.core.tikomni_common import
|
|
17
|
+
from scripts.core.tikomni_common import (
|
|
18
|
+
DEFAULT_USER_AGENT,
|
|
19
|
+
call_json_api,
|
|
20
|
+
normalize_text,
|
|
21
|
+
resolve_timeout_retry_policy,
|
|
22
|
+
)
|
|
16
23
|
|
|
17
24
|
DEFAULT_U3_PROVIDER = "oss"
|
|
18
25
|
DEFAULT_CONTENT_TYPE = "video/mp4"
|
|
19
26
|
DOWNLOAD_CHUNK_SIZE = 1024 * 1024
|
|
27
|
+
TIMEOUT_LIKE_HTTP_STATUS_CODES = {408, 429, 502, 503, 504}
|
|
20
28
|
|
|
21
29
|
|
|
22
30
|
def _safe_name_from_url(source_url: str) -> str:
|
|
@@ -135,6 +143,16 @@ def create_u3_upload(
|
|
|
135
143
|
)
|
|
136
144
|
|
|
137
145
|
|
|
146
|
+
def _is_timeout_like_upload_error(status_code: Optional[int], error_reason: Optional[str]) -> bool:
|
|
147
|
+
if isinstance(status_code, (int, float)) and int(status_code) in TIMEOUT_LIKE_HTTP_STATUS_CODES:
|
|
148
|
+
return True
|
|
149
|
+
|
|
150
|
+
reason = str(error_reason or "").strip().lower()
|
|
151
|
+
if not reason:
|
|
152
|
+
return False
|
|
153
|
+
return any(token in reason for token in ("timeout", "timed out", "deadline exceeded"))
|
|
154
|
+
|
|
155
|
+
|
|
138
156
|
def upload_file_to_presigned_url(
|
|
139
157
|
*,
|
|
140
158
|
upload_url: str,
|
|
@@ -147,35 +165,130 @@ def upload_file_to_presigned_url(
|
|
|
147
165
|
try:
|
|
148
166
|
with open(file_path, "rb") as handle:
|
|
149
167
|
data = handle.read()
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
"
|
|
153
|
-
"
|
|
168
|
+
except Exception as error:
|
|
169
|
+
return {
|
|
170
|
+
"ok": False,
|
|
171
|
+
"status_code": None,
|
|
172
|
+
"error_reason": f"u3_upload_failed:{normalize_text(error)}",
|
|
173
|
+
"retry_attempt": 0,
|
|
174
|
+
"timeout_retry_max": 0,
|
|
175
|
+
"timeout_retry_exhausted": False,
|
|
176
|
+
"retry_chain": [],
|
|
154
177
|
}
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
headers=
|
|
166
|
-
|
|
178
|
+
|
|
179
|
+
headers = {
|
|
180
|
+
"Content-Type": content_type or DEFAULT_CONTENT_TYPE,
|
|
181
|
+
"User-Agent": os.getenv("TIKOMNI_HTTP_USER_AGENT", DEFAULT_USER_AGENT),
|
|
182
|
+
}
|
|
183
|
+
if isinstance(upload_headers, dict):
|
|
184
|
+
for key, value in upload_headers.items():
|
|
185
|
+
header_key = str(key).strip()
|
|
186
|
+
if not header_key:
|
|
187
|
+
continue
|
|
188
|
+
headers[header_key] = str(value)
|
|
189
|
+
|
|
190
|
+
retry_policy = resolve_timeout_retry_policy()
|
|
191
|
+
timeout_retry_max = int(retry_policy.get("max_retries", 0) or 0)
|
|
192
|
+
retry_backoff_ms = int(retry_policy.get("backoff_ms", 0) or 0)
|
|
193
|
+
max_attempts = 1 + timeout_retry_max
|
|
194
|
+
retry_chain: List[Dict[str, Any]] = []
|
|
195
|
+
last_result: Dict[str, Any] = {
|
|
196
|
+
"ok": False,
|
|
197
|
+
"status_code": None,
|
|
198
|
+
"error_reason": "u3_upload_failed:unknown",
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
for attempt in range(1, max_attempts + 1):
|
|
202
|
+
if attempt > 1 and retry_backoff_ms > 0:
|
|
203
|
+
sleep_ms = retry_backoff_ms * (2 ** (attempt - 2))
|
|
204
|
+
time.sleep(sleep_ms / 1000.0)
|
|
205
|
+
|
|
206
|
+
try:
|
|
207
|
+
request = urllib.request.Request(
|
|
208
|
+
upload_url,
|
|
209
|
+
data=data,
|
|
210
|
+
headers=headers,
|
|
211
|
+
method=(upload_method or "PUT").upper(),
|
|
212
|
+
)
|
|
213
|
+
with urllib.request.urlopen(request, timeout=max(timeout_ms / 1000.0, 1.0)) as response:
|
|
214
|
+
status_code = response.getcode()
|
|
215
|
+
result: Dict[str, Any] = {
|
|
216
|
+
"ok": 200 <= int(status_code) < 300,
|
|
217
|
+
"status_code": status_code,
|
|
218
|
+
"error_reason": None if 200 <= int(status_code) < 300 else f"u3_upload_http_{status_code}",
|
|
219
|
+
}
|
|
220
|
+
except urllib.error.HTTPError as error:
|
|
221
|
+
result = {
|
|
222
|
+
"ok": False,
|
|
223
|
+
"status_code": error.code,
|
|
224
|
+
"error_reason": f"u3_upload_http_{error.code}",
|
|
225
|
+
}
|
|
226
|
+
except urllib.error.URLError as error:
|
|
227
|
+
reason_obj = getattr(error, "reason", error)
|
|
228
|
+
reason_text = normalize_text(reason_obj)
|
|
229
|
+
result = {
|
|
230
|
+
"ok": False,
|
|
231
|
+
"status_code": None,
|
|
232
|
+
"error_reason": f"u3_upload_failed:{reason_text or 'network_error'}",
|
|
233
|
+
"_timeout_like": isinstance(reason_obj, socket.timeout)
|
|
234
|
+
or _is_timeout_like_upload_error(status_code=None, error_reason=reason_text),
|
|
235
|
+
}
|
|
236
|
+
except (TimeoutError, socket.timeout) as error:
|
|
237
|
+
result = {
|
|
238
|
+
"ok": False,
|
|
239
|
+
"status_code": None,
|
|
240
|
+
"error_reason": f"u3_upload_failed:{normalize_text(error) or 'timeout'}",
|
|
241
|
+
"_timeout_like": True,
|
|
242
|
+
}
|
|
243
|
+
except Exception as error:
|
|
244
|
+
reason_text = normalize_text(error)
|
|
245
|
+
result = {
|
|
246
|
+
"ok": False,
|
|
247
|
+
"status_code": None,
|
|
248
|
+
"error_reason": f"u3_upload_failed:{reason_text or 'unknown'}",
|
|
249
|
+
"_timeout_like": _is_timeout_like_upload_error(status_code=None, error_reason=reason_text),
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
if result.get("ok"):
|
|
253
|
+
result["retry_attempt"] = max(0, attempt - 1)
|
|
254
|
+
result["timeout_retry_max"] = timeout_retry_max
|
|
255
|
+
result["timeout_retry_exhausted"] = False
|
|
256
|
+
result["retry_chain"] = retry_chain
|
|
257
|
+
return result
|
|
258
|
+
|
|
259
|
+
timeout_like = bool(
|
|
260
|
+
result.pop(
|
|
261
|
+
"_timeout_like",
|
|
262
|
+
_is_timeout_like_upload_error(
|
|
263
|
+
status_code=result.get("status_code"),
|
|
264
|
+
error_reason=result.get("error_reason"),
|
|
265
|
+
),
|
|
266
|
+
)
|
|
167
267
|
)
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
"
|
|
172
|
-
"
|
|
173
|
-
"
|
|
268
|
+
retry_chain.append(
|
|
269
|
+
{
|
|
270
|
+
"attempt": attempt,
|
|
271
|
+
"status_code": result.get("status_code"),
|
|
272
|
+
"error_reason": result.get("error_reason"),
|
|
273
|
+
"timeout_like": timeout_like,
|
|
174
274
|
}
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
275
|
+
)
|
|
276
|
+
last_result = dict(result)
|
|
277
|
+
|
|
278
|
+
if timeout_like and attempt < max_attempts:
|
|
279
|
+
continue
|
|
280
|
+
|
|
281
|
+
last_result["retry_attempt"] = max(0, attempt - 1)
|
|
282
|
+
last_result["timeout_retry_max"] = timeout_retry_max
|
|
283
|
+
last_result["timeout_retry_exhausted"] = bool(timeout_like and attempt >= max_attempts)
|
|
284
|
+
last_result["retry_chain"] = retry_chain
|
|
285
|
+
return last_result
|
|
286
|
+
|
|
287
|
+
last_result["retry_attempt"] = timeout_retry_max
|
|
288
|
+
last_result["timeout_retry_max"] = timeout_retry_max
|
|
289
|
+
last_result["timeout_retry_exhausted"] = True
|
|
290
|
+
last_result["retry_chain"] = retry_chain
|
|
291
|
+
return last_result
|
|
179
292
|
|
|
180
293
|
|
|
181
294
|
def complete_u3_upload(
|
|
@@ -284,6 +397,11 @@ def run_u3_public_url_fallback(
|
|
|
284
397
|
"ok": bool(upload_response.get("ok")),
|
|
285
398
|
"status_code": upload_response.get("status_code"),
|
|
286
399
|
"error_reason": upload_response.get("error_reason"),
|
|
400
|
+
"retry_attempt": upload_response.get("retry_attempt", 0),
|
|
401
|
+
"retry_count": len(upload_response.get("retry_chain") or []),
|
|
402
|
+
"timeout_retry_max": upload_response.get("timeout_retry_max", 0),
|
|
403
|
+
"timeout_retry_exhausted": bool(upload_response.get("timeout_retry_exhausted")),
|
|
404
|
+
"retry_chain": upload_response.get("retry_chain") or [],
|
|
287
405
|
}
|
|
288
406
|
)
|
|
289
407
|
if not upload_response.get("ok"):
|