@tikomni/skills 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/scripts/core/tikomni_common.py +7 -0
- package/skills/social-media-crawl/scripts/core/u3_fallback.py +146 -28
- package/skills/social-media-crawl/scripts/pipelines/douyin_metadata.py +151 -0
- package/skills/social-media-crawl/scripts/pipelines/home_asr.py +40 -37
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +5 -11
- package/skills/social-media-crawl/scripts/pipelines/input_contracts.py +318 -0
- package/skills/social-media-crawl/scripts/pipelines/media_url_rules.py +86 -0
- package/skills/social-media-crawl/scripts/pipelines/platform_adapters.py +77 -30
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +79 -73
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +86 -60
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +5 -3
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tikomni/skills",
|
|
3
|
-
"version": "0.1.7",
|
|
3
|
+
"version": "0.1.9",
|
|
4
4
|
"description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
|
|
package/skills/social-media-crawl/scripts/core/tikomni_common.py
CHANGED
|
@@ -257,6 +257,13 @@ def _resolve_timeout_retry_backoff_ms() -> int:
|
|
|
257
257
|
return max(0, min(backoff, 5000))
|
|
258
258
|
|
|
259
259
|
|
|
260
|
+
def resolve_timeout_retry_policy() -> Dict[str, int]:
|
|
261
|
+
return {
|
|
262
|
+
"max_retries": _resolve_timeout_retry_max(),
|
|
263
|
+
"backoff_ms": _resolve_timeout_retry_backoff_ms(),
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
|
|
260
267
|
def _wait_rate_limit_slot(qps: float) -> int:
|
|
261
268
|
global _NEXT_ALLOWED_TS
|
|
262
269
|
interval_sec = 1.0 / max(qps, 0.1)
|
|
package/skills/social-media-crawl/scripts/core/u3_fallback.py
CHANGED
|
@@ -5,18 +5,26 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
import mimetypes
|
|
7
7
|
import os
|
|
8
|
+
import socket
|
|
8
9
|
import tempfile
|
|
10
|
+
import time
|
|
9
11
|
import urllib.error
|
|
10
12
|
import urllib.parse
|
|
11
13
|
import urllib.request
|
|
12
14
|
from pathlib import Path
|
|
13
|
-
from typing import Any, Dict, Optional
|
|
15
|
+
from typing import Any, Dict, List, Optional
|
|
14
16
|
|
|
15
|
-
from scripts.core.tikomni_common import
|
|
17
|
+
from scripts.core.tikomni_common import (
|
|
18
|
+
DEFAULT_USER_AGENT,
|
|
19
|
+
call_json_api,
|
|
20
|
+
normalize_text,
|
|
21
|
+
resolve_timeout_retry_policy,
|
|
22
|
+
)
|
|
16
23
|
|
|
17
24
|
DEFAULT_U3_PROVIDER = "oss"
|
|
18
25
|
DEFAULT_CONTENT_TYPE = "video/mp4"
|
|
19
26
|
DOWNLOAD_CHUNK_SIZE = 1024 * 1024
|
|
27
|
+
TIMEOUT_LIKE_HTTP_STATUS_CODES = {408, 429, 502, 503, 504}
|
|
20
28
|
|
|
21
29
|
|
|
22
30
|
def _safe_name_from_url(source_url: str) -> str:
|
|
@@ -135,6 +143,16 @@ def create_u3_upload(
|
|
|
135
143
|
)
|
|
136
144
|
|
|
137
145
|
|
|
146
|
+
def _is_timeout_like_upload_error(status_code: Optional[int], error_reason: Optional[str]) -> bool:
|
|
147
|
+
if isinstance(status_code, (int, float)) and int(status_code) in TIMEOUT_LIKE_HTTP_STATUS_CODES:
|
|
148
|
+
return True
|
|
149
|
+
|
|
150
|
+
reason = str(error_reason or "").strip().lower()
|
|
151
|
+
if not reason:
|
|
152
|
+
return False
|
|
153
|
+
return any(token in reason for token in ("timeout", "timed out", "deadline exceeded"))
|
|
154
|
+
|
|
155
|
+
|
|
138
156
|
def upload_file_to_presigned_url(
|
|
139
157
|
*,
|
|
140
158
|
upload_url: str,
|
|
@@ -147,35 +165,130 @@ def upload_file_to_presigned_url(
|
|
|
147
165
|
try:
|
|
148
166
|
with open(file_path, "rb") as handle:
|
|
149
167
|
data = handle.read()
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
"
|
|
153
|
-
"
|
|
168
|
+
except Exception as error:
|
|
169
|
+
return {
|
|
170
|
+
"ok": False,
|
|
171
|
+
"status_code": None,
|
|
172
|
+
"error_reason": f"u3_upload_failed:{normalize_text(error)}",
|
|
173
|
+
"retry_attempt": 0,
|
|
174
|
+
"timeout_retry_max": 0,
|
|
175
|
+
"timeout_retry_exhausted": False,
|
|
176
|
+
"retry_chain": [],
|
|
154
177
|
}
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
headers=
|
|
166
|
-
|
|
178
|
+
|
|
179
|
+
headers = {
|
|
180
|
+
"Content-Type": content_type or DEFAULT_CONTENT_TYPE,
|
|
181
|
+
"User-Agent": os.getenv("TIKOMNI_HTTP_USER_AGENT", DEFAULT_USER_AGENT),
|
|
182
|
+
}
|
|
183
|
+
if isinstance(upload_headers, dict):
|
|
184
|
+
for key, value in upload_headers.items():
|
|
185
|
+
header_key = str(key).strip()
|
|
186
|
+
if not header_key:
|
|
187
|
+
continue
|
|
188
|
+
headers[header_key] = str(value)
|
|
189
|
+
|
|
190
|
+
retry_policy = resolve_timeout_retry_policy()
|
|
191
|
+
timeout_retry_max = int(retry_policy.get("max_retries", 0) or 0)
|
|
192
|
+
retry_backoff_ms = int(retry_policy.get("backoff_ms", 0) or 0)
|
|
193
|
+
max_attempts = 1 + timeout_retry_max
|
|
194
|
+
retry_chain: List[Dict[str, Any]] = []
|
|
195
|
+
last_result: Dict[str, Any] = {
|
|
196
|
+
"ok": False,
|
|
197
|
+
"status_code": None,
|
|
198
|
+
"error_reason": "u3_upload_failed:unknown",
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
for attempt in range(1, max_attempts + 1):
|
|
202
|
+
if attempt > 1 and retry_backoff_ms > 0:
|
|
203
|
+
sleep_ms = retry_backoff_ms * (2 ** (attempt - 2))
|
|
204
|
+
time.sleep(sleep_ms / 1000.0)
|
|
205
|
+
|
|
206
|
+
try:
|
|
207
|
+
request = urllib.request.Request(
|
|
208
|
+
upload_url,
|
|
209
|
+
data=data,
|
|
210
|
+
headers=headers,
|
|
211
|
+
method=(upload_method or "PUT").upper(),
|
|
212
|
+
)
|
|
213
|
+
with urllib.request.urlopen(request, timeout=max(timeout_ms / 1000.0, 1.0)) as response:
|
|
214
|
+
status_code = response.getcode()
|
|
215
|
+
result: Dict[str, Any] = {
|
|
216
|
+
"ok": 200 <= int(status_code) < 300,
|
|
217
|
+
"status_code": status_code,
|
|
218
|
+
"error_reason": None if 200 <= int(status_code) < 300 else f"u3_upload_http_{status_code}",
|
|
219
|
+
}
|
|
220
|
+
except urllib.error.HTTPError as error:
|
|
221
|
+
result = {
|
|
222
|
+
"ok": False,
|
|
223
|
+
"status_code": error.code,
|
|
224
|
+
"error_reason": f"u3_upload_http_{error.code}",
|
|
225
|
+
}
|
|
226
|
+
except urllib.error.URLError as error:
|
|
227
|
+
reason_obj = getattr(error, "reason", error)
|
|
228
|
+
reason_text = normalize_text(reason_obj)
|
|
229
|
+
result = {
|
|
230
|
+
"ok": False,
|
|
231
|
+
"status_code": None,
|
|
232
|
+
"error_reason": f"u3_upload_failed:{reason_text or 'network_error'}",
|
|
233
|
+
"_timeout_like": isinstance(reason_obj, socket.timeout)
|
|
234
|
+
or _is_timeout_like_upload_error(status_code=None, error_reason=reason_text),
|
|
235
|
+
}
|
|
236
|
+
except (TimeoutError, socket.timeout) as error:
|
|
237
|
+
result = {
|
|
238
|
+
"ok": False,
|
|
239
|
+
"status_code": None,
|
|
240
|
+
"error_reason": f"u3_upload_failed:{normalize_text(error) or 'timeout'}",
|
|
241
|
+
"_timeout_like": True,
|
|
242
|
+
}
|
|
243
|
+
except Exception as error:
|
|
244
|
+
reason_text = normalize_text(error)
|
|
245
|
+
result = {
|
|
246
|
+
"ok": False,
|
|
247
|
+
"status_code": None,
|
|
248
|
+
"error_reason": f"u3_upload_failed:{reason_text or 'unknown'}",
|
|
249
|
+
"_timeout_like": _is_timeout_like_upload_error(status_code=None, error_reason=reason_text),
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
if result.get("ok"):
|
|
253
|
+
result["retry_attempt"] = max(0, attempt - 1)
|
|
254
|
+
result["timeout_retry_max"] = timeout_retry_max
|
|
255
|
+
result["timeout_retry_exhausted"] = False
|
|
256
|
+
result["retry_chain"] = retry_chain
|
|
257
|
+
return result
|
|
258
|
+
|
|
259
|
+
timeout_like = bool(
|
|
260
|
+
result.pop(
|
|
261
|
+
"_timeout_like",
|
|
262
|
+
_is_timeout_like_upload_error(
|
|
263
|
+
status_code=result.get("status_code"),
|
|
264
|
+
error_reason=result.get("error_reason"),
|
|
265
|
+
),
|
|
266
|
+
)
|
|
167
267
|
)
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
"
|
|
172
|
-
"
|
|
173
|
-
"
|
|
268
|
+
retry_chain.append(
|
|
269
|
+
{
|
|
270
|
+
"attempt": attempt,
|
|
271
|
+
"status_code": result.get("status_code"),
|
|
272
|
+
"error_reason": result.get("error_reason"),
|
|
273
|
+
"timeout_like": timeout_like,
|
|
174
274
|
}
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
275
|
+
)
|
|
276
|
+
last_result = dict(result)
|
|
277
|
+
|
|
278
|
+
if timeout_like and attempt < max_attempts:
|
|
279
|
+
continue
|
|
280
|
+
|
|
281
|
+
last_result["retry_attempt"] = max(0, attempt - 1)
|
|
282
|
+
last_result["timeout_retry_max"] = timeout_retry_max
|
|
283
|
+
last_result["timeout_retry_exhausted"] = bool(timeout_like and attempt >= max_attempts)
|
|
284
|
+
last_result["retry_chain"] = retry_chain
|
|
285
|
+
return last_result
|
|
286
|
+
|
|
287
|
+
last_result["retry_attempt"] = timeout_retry_max
|
|
288
|
+
last_result["timeout_retry_max"] = timeout_retry_max
|
|
289
|
+
last_result["timeout_retry_exhausted"] = True
|
|
290
|
+
last_result["retry_chain"] = retry_chain
|
|
291
|
+
return last_result
|
|
179
292
|
|
|
180
293
|
|
|
181
294
|
def complete_u3_upload(
|
|
@@ -284,6 +397,11 @@ def run_u3_public_url_fallback(
|
|
|
284
397
|
"ok": bool(upload_response.get("ok")),
|
|
285
398
|
"status_code": upload_response.get("status_code"),
|
|
286
399
|
"error_reason": upload_response.get("error_reason"),
|
|
400
|
+
"retry_attempt": upload_response.get("retry_attempt", 0),
|
|
401
|
+
"retry_count": len(upload_response.get("retry_chain") or []),
|
|
402
|
+
"timeout_retry_max": upload_response.get("timeout_retry_max", 0),
|
|
403
|
+
"timeout_retry_exhausted": bool(upload_response.get("timeout_retry_exhausted")),
|
|
404
|
+
"retry_chain": upload_response.get("retry_chain") or [],
|
|
287
405
|
}
|
|
288
406
|
)
|
|
289
407
|
if not upload_response.get("ok"):
|
|
package/skills/social-media-crawl/scripts/pipelines/douyin_metadata.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared Douyin metadata extraction helpers."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
from scripts.core.tikomni_common import normalize_text
|
|
10
|
+
|
|
11
|
+
INVALID_AUTHOR_HANDLE_VALUES = {"0", "unknown", "none", "null", "nil", "na", "n/a"}
|
|
12
|
+
MUSIC_TITLE_PATTERN = re.compile(r"^@?.+?(?:创作的原声|作品使用的原声|的原声)$")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _safe_int(value: Any) -> Optional[int]:
|
|
16
|
+
if value is None:
|
|
17
|
+
return None
|
|
18
|
+
if isinstance(value, bool):
|
|
19
|
+
return int(value)
|
|
20
|
+
if isinstance(value, int):
|
|
21
|
+
return value
|
|
22
|
+
if isinstance(value, float):
|
|
23
|
+
return int(value)
|
|
24
|
+
|
|
25
|
+
text = normalize_text(value)
|
|
26
|
+
if not text:
|
|
27
|
+
return None
|
|
28
|
+
try:
|
|
29
|
+
return int(float(text.replace(",", "")))
|
|
30
|
+
except Exception:
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def normalize_douyin_author_handle(*values: Any) -> str:
|
|
35
|
+
for value in values:
|
|
36
|
+
text = normalize_text(value)
|
|
37
|
+
if not text:
|
|
38
|
+
continue
|
|
39
|
+
if text.lower() in INVALID_AUTHOR_HANDLE_VALUES:
|
|
40
|
+
continue
|
|
41
|
+
return text
|
|
42
|
+
return ""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def looks_like_douyin_music_title(value: Any) -> bool:
|
|
46
|
+
title = normalize_text(value)
|
|
47
|
+
if not title:
|
|
48
|
+
return False
|
|
49
|
+
return bool(MUSIC_TITLE_PATTERN.match(title))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def extract_douyin_caption(item: Dict[str, Any]) -> str:
|
|
53
|
+
if not isinstance(item, dict):
|
|
54
|
+
return ""
|
|
55
|
+
for key in ("desc", "caption", "content", "item_title", "preview_title", "title"):
|
|
56
|
+
text = normalize_text(item.get(key))
|
|
57
|
+
if text:
|
|
58
|
+
return text
|
|
59
|
+
return ""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def title_from_douyin_caption(caption: Any) -> str:
|
|
63
|
+
text = normalize_text(caption)
|
|
64
|
+
if not text:
|
|
65
|
+
return ""
|
|
66
|
+
|
|
67
|
+
stripped = re.split(r"\s*#\S+", text, maxsplit=1)[0].strip()
|
|
68
|
+
if stripped:
|
|
69
|
+
return stripped
|
|
70
|
+
return text
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def extract_douyin_title(item: Dict[str, Any]) -> str:
|
|
74
|
+
if not isinstance(item, dict):
|
|
75
|
+
return ""
|
|
76
|
+
|
|
77
|
+
# Only read title-like fields from the work object itself.
|
|
78
|
+
# Nested `music.title` is an audio title, not the work title.
|
|
79
|
+
caption_title = title_from_douyin_caption(extract_douyin_caption(item))
|
|
80
|
+
for key in ("item_title", "preview_title", "title"):
|
|
81
|
+
candidate = normalize_text(item.get(key))
|
|
82
|
+
if not candidate:
|
|
83
|
+
continue
|
|
84
|
+
if looks_like_douyin_music_title(candidate) and caption_title:
|
|
85
|
+
continue
|
|
86
|
+
return candidate
|
|
87
|
+
return caption_title
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def extract_douyin_author(item: Dict[str, Any]) -> Dict[str, Optional[str]]:
|
|
91
|
+
author = item.get("author") if isinstance(item.get("author"), dict) else {}
|
|
92
|
+
|
|
93
|
+
author_platform_id = (
|
|
94
|
+
normalize_text(author.get("uid"))
|
|
95
|
+
or normalize_text(author.get("id"))
|
|
96
|
+
or normalize_text(item.get("author_user_id"))
|
|
97
|
+
)
|
|
98
|
+
author_handle = normalize_douyin_author_handle(
|
|
99
|
+
author.get("unique_id"),
|
|
100
|
+
author.get("short_id"),
|
|
101
|
+
author.get("douyin_id"),
|
|
102
|
+
author.get("display_id"),
|
|
103
|
+
author.get("nickname"),
|
|
104
|
+
)
|
|
105
|
+
douyin_sec_uid = normalize_text(author.get("sec_uid"))
|
|
106
|
+
douyin_aweme_author_id = normalize_text(item.get("author_user_id")) or author_platform_id
|
|
107
|
+
|
|
108
|
+
return {
|
|
109
|
+
"author_handle": author_handle or None,
|
|
110
|
+
"platform_author_id": author_platform_id or None,
|
|
111
|
+
"author_platform_id": author_platform_id or None,
|
|
112
|
+
"douyin_sec_uid": douyin_sec_uid or None,
|
|
113
|
+
"douyin_aweme_author_id": douyin_aweme_author_id or None,
|
|
114
|
+
"unique_id": normalize_text(author.get("unique_id")) or None,
|
|
115
|
+
"nickname": normalize_text(author.get("nickname")) or None,
|
|
116
|
+
"signature": normalize_text(author.get("signature")) or None,
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def extract_douyin_metrics(item: Dict[str, Any]) -> Dict[str, Optional[int]]:
|
|
121
|
+
statistics = item.get("statistics") if isinstance(item.get("statistics"), dict) else {}
|
|
122
|
+
|
|
123
|
+
def metric(*keys: str, default: Optional[int] = 0) -> Optional[int]:
|
|
124
|
+
for key in keys:
|
|
125
|
+
value = _safe_int(statistics.get(key))
|
|
126
|
+
if value is not None:
|
|
127
|
+
return value
|
|
128
|
+
value = _safe_int(item.get(key))
|
|
129
|
+
if value is not None:
|
|
130
|
+
return value
|
|
131
|
+
return default
|
|
132
|
+
|
|
133
|
+
metrics = {
|
|
134
|
+
"digg_count": metric("digg_count", "like_count"),
|
|
135
|
+
"comment_count": metric("comment_count"),
|
|
136
|
+
"collect_count": metric("collect_count"),
|
|
137
|
+
"share_count": metric("share_count", "forward_count"),
|
|
138
|
+
"play_count": metric("play_count", "view_count", default=None),
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
play_count = metrics.get("play_count")
|
|
142
|
+
engagement_floor = max(
|
|
143
|
+
int(metrics.get("digg_count") or 0),
|
|
144
|
+
int(metrics.get("comment_count") or 0),
|
|
145
|
+
int(metrics.get("collect_count") or 0),
|
|
146
|
+
int(metrics.get("share_count") or 0),
|
|
147
|
+
)
|
|
148
|
+
if play_count is not None and int(play_count) <= 0 and engagement_floor > 0:
|
|
149
|
+
metrics["play_count"] = None
|
|
150
|
+
|
|
151
|
+
return metrics
|
|
package/skills/social-media-crawl/scripts/pipelines/home_asr.py
CHANGED
|
@@ -12,6 +12,7 @@ from scripts.core.progress_report import ProgressReporter
|
|
|
12
12
|
from scripts.core.tikomni_common import normalize_text
|
|
13
13
|
from scripts.core.asr_pipeline import (
|
|
14
14
|
clamp_u2_batch_submit_size,
|
|
15
|
+
derive_asr_clean_text,
|
|
15
16
|
normalize_media_url,
|
|
16
17
|
run_u2_asr_batch_with_timeout_retry,
|
|
17
18
|
run_u2_asr_candidates_with_timeout_retry,
|
|
@@ -123,6 +124,30 @@ def _clean_text(text: Any) -> str:
|
|
|
123
124
|
return "\n".join(lines).strip()
|
|
124
125
|
|
|
125
126
|
|
|
127
|
+
def _build_transcript_result(
|
|
128
|
+
raw_text: Any,
|
|
129
|
+
*,
|
|
130
|
+
subtitle_source: str,
|
|
131
|
+
asr_source: str,
|
|
132
|
+
) -> Dict[str, Any]:
|
|
133
|
+
transcript = _clean_text(raw_text)
|
|
134
|
+
asr_clean = derive_asr_clean_text(transcript)
|
|
135
|
+
primary_text = asr_clean or transcript
|
|
136
|
+
return {
|
|
137
|
+
"subtitle_raw": transcript,
|
|
138
|
+
"subtitle_source": subtitle_source,
|
|
139
|
+
"asr_raw": transcript,
|
|
140
|
+
"asr_clean": asr_clean,
|
|
141
|
+
"primary_text": primary_text,
|
|
142
|
+
"primary_text_source": "asr_clean",
|
|
143
|
+
"analysis_eligibility": "eligible" if transcript else "incomplete",
|
|
144
|
+
"analysis_exclusion_reason": "" if transcript else "video_asr_unavailable",
|
|
145
|
+
"asr_status": "success" if transcript else "failed",
|
|
146
|
+
"asr_error_reason": "",
|
|
147
|
+
"asr_source": asr_source,
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
|
|
126
151
|
def _subtitle_text_from_raw(raw: str) -> str:
|
|
127
152
|
content = (raw or "").strip()
|
|
128
153
|
if not content:
|
|
@@ -294,19 +319,11 @@ def _run_u2_for_work(
|
|
|
294
319
|
}
|
|
295
320
|
|
|
296
321
|
if transcript:
|
|
297
|
-
return
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
"
|
|
301
|
-
|
|
302
|
-
"primary_text": transcript,
|
|
303
|
-
"primary_text_source": "asr_clean",
|
|
304
|
-
"analysis_eligibility": "eligible",
|
|
305
|
-
"analysis_exclusion_reason": "",
|
|
306
|
-
"asr_status": "success",
|
|
307
|
-
"asr_error_reason": "",
|
|
308
|
-
"asr_source": "external_asr",
|
|
309
|
-
}, trace
|
|
322
|
+
return _build_transcript_result(
|
|
323
|
+
transcript,
|
|
324
|
+
subtitle_source="external_asr",
|
|
325
|
+
asr_source="external_asr",
|
|
326
|
+
), trace
|
|
310
327
|
|
|
311
328
|
return {
|
|
312
329
|
"subtitle_raw": "",
|
|
@@ -715,17 +732,11 @@ def _run_u2_batch_for_entries(
|
|
|
715
732
|
if (mapped_ok or mapped_status in {"SUCCEEDED", "SUCCESS", "COMPLETED", "DONE"}) and transcript:
|
|
716
733
|
for entry in grouped_entries:
|
|
717
734
|
entry["work"].update(
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
"
|
|
721
|
-
"
|
|
722
|
-
|
|
723
|
-
"analysis_eligibility": "eligible",
|
|
724
|
-
"analysis_exclusion_reason": "",
|
|
725
|
-
"asr_status": "success",
|
|
726
|
-
"asr_error_reason": "",
|
|
727
|
-
"asr_source": "external_asr",
|
|
728
|
-
}
|
|
735
|
+
_build_transcript_result(
|
|
736
|
+
transcript,
|
|
737
|
+
subtitle_source="external_asr",
|
|
738
|
+
asr_source="external_asr",
|
|
739
|
+
)
|
|
729
740
|
)
|
|
730
741
|
mapped_count += 1
|
|
731
742
|
else:
|
|
@@ -927,19 +938,11 @@ def enrich_author_home_asr(
|
|
|
927
938
|
subtitle_invalid = _invalid_subtitle_reason(subtitle_text)
|
|
928
939
|
if subtitle_invalid is None:
|
|
929
940
|
work.update(
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
"
|
|
934
|
-
|
|
935
|
-
"primary_text": subtitle_text,
|
|
936
|
-
"primary_text_source": "asr_clean",
|
|
937
|
-
"analysis_eligibility": "eligible",
|
|
938
|
-
"analysis_exclusion_reason": "",
|
|
939
|
-
"asr_status": "success",
|
|
940
|
-
"asr_error_reason": "",
|
|
941
|
-
"asr_source": "native_subtitle",
|
|
942
|
-
}
|
|
941
|
+
_build_transcript_result(
|
|
942
|
+
subtitle_text,
|
|
943
|
+
subtitle_source="native_subtitle",
|
|
944
|
+
asr_source="native_subtitle",
|
|
945
|
+
)
|
|
943
946
|
)
|
|
944
947
|
trace.append(
|
|
945
948
|
{
|
|
package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py
CHANGED
|
@@ -9,6 +9,7 @@ from urllib.parse import parse_qs, urlparse
|
|
|
9
9
|
from scripts.core.extract_pipeline import build_api_trace
|
|
10
10
|
from scripts.core.progress_report import ProgressReporter
|
|
11
11
|
from scripts.core.tikomni_common import call_json_api, deep_find_all, deep_find_first
|
|
12
|
+
from scripts.pipelines.input_contracts import extract_douyin_sec_uid, extract_xhs_user_id, looks_like_xhs_user_id
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
def _to_text(value: Any) -> str:
|
|
@@ -359,23 +360,16 @@ def _call_xhs_route(
|
|
|
359
360
|
|
|
360
361
|
|
|
361
362
|
def _guess_douyin_sec_user_id(input_value: str) -> str:
|
|
362
|
-
|
|
363
|
-
if not value:
|
|
364
|
-
return ""
|
|
365
|
-
if "sec_uid=" in value:
|
|
366
|
-
query = parse_qs(urlparse(value).query)
|
|
367
|
-
sec = query.get("sec_uid") or query.get("sec_user_id")
|
|
368
|
-
if sec and sec[0]:
|
|
369
|
-
return sec[0]
|
|
370
|
-
if value.startswith("MS4wLjAB") or value.startswith("MS4wLjA"):
|
|
371
|
-
return value
|
|
372
|
-
return ""
|
|
363
|
+
return str(extract_douyin_sec_uid(input_value) or "")
|
|
373
364
|
|
|
374
365
|
|
|
375
366
|
def _guess_xhs_ids(input_value: str) -> Tuple[str, str]:
|
|
376
367
|
value = (input_value or "").strip()
|
|
377
368
|
if not value:
|
|
378
369
|
return "", ""
|
|
370
|
+
direct_user_id = str(extract_xhs_user_id(value) or "")
|
|
371
|
+
if direct_user_id and looks_like_xhs_user_id(direct_user_id) and not value.startswith(("http://", "https://")):
|
|
372
|
+
return direct_user_id, ""
|
|
379
373
|
parsed = urlparse(value)
|
|
380
374
|
if parsed.query:
|
|
381
375
|
query = parse_qs(parsed.query)
|