@tikomni/skills 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tikomni/skills",
3
- "version": "0.1.7",
3
+ "version": "0.1.9",
4
4
  "description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
@@ -257,6 +257,13 @@ def _resolve_timeout_retry_backoff_ms() -> int:
257
257
  return max(0, min(backoff, 5000))
258
258
 
259
259
 
260
def resolve_timeout_retry_policy() -> Dict[str, int]:
    """Bundle the timeout-retry settings into a single mapping.

    Returns:
        A dict with two entries: ``max_retries`` (the value produced by
        ``_resolve_timeout_retry_max``) and ``backoff_ms`` (the value produced
        by ``_resolve_timeout_retry_backoff_ms``).
    """
    max_retries = _resolve_timeout_retry_max()
    backoff_ms = _resolve_timeout_retry_backoff_ms()
    return dict(max_retries=max_retries, backoff_ms=backoff_ms)
265
+
266
+
260
267
  def _wait_rate_limit_slot(qps: float) -> int:
261
268
  global _NEXT_ALLOWED_TS
262
269
  interval_sec = 1.0 / max(qps, 0.1)
@@ -5,18 +5,26 @@ from __future__ import annotations
5
5
 
6
6
  import mimetypes
7
7
  import os
8
+ import socket
8
9
  import tempfile
10
+ import time
9
11
  import urllib.error
10
12
  import urllib.parse
11
13
  import urllib.request
12
14
  from pathlib import Path
13
- from typing import Any, Dict, Optional
15
+ from typing import Any, Dict, List, Optional
14
16
 
15
- from scripts.core.tikomni_common import DEFAULT_USER_AGENT, call_json_api, normalize_text
17
+ from scripts.core.tikomni_common import (
18
+ DEFAULT_USER_AGENT,
19
+ call_json_api,
20
+ normalize_text,
21
+ resolve_timeout_retry_policy,
22
+ )
16
23
 
17
24
  DEFAULT_U3_PROVIDER = "oss"
18
25
  DEFAULT_CONTENT_TYPE = "video/mp4"
19
26
  DOWNLOAD_CHUNK_SIZE = 1024 * 1024
27
+ TIMEOUT_LIKE_HTTP_STATUS_CODES = {408, 429, 502, 503, 504}
20
28
 
21
29
 
22
30
  def _safe_name_from_url(source_url: str) -> str:
@@ -135,6 +143,16 @@ def create_u3_upload(
135
143
  )
136
144
 
137
145
 
146
+ def _is_timeout_like_upload_error(status_code: Optional[int], error_reason: Optional[str]) -> bool:
147
+ if isinstance(status_code, (int, float)) and int(status_code) in TIMEOUT_LIKE_HTTP_STATUS_CODES:
148
+ return True
149
+
150
+ reason = str(error_reason or "").strip().lower()
151
+ if not reason:
152
+ return False
153
+ return any(token in reason for token in ("timeout", "timed out", "deadline exceeded"))
154
+
155
+
138
156
  def upload_file_to_presigned_url(
139
157
  *,
140
158
  upload_url: str,
@@ -147,35 +165,130 @@ def upload_file_to_presigned_url(
147
165
  try:
148
166
  with open(file_path, "rb") as handle:
149
167
  data = handle.read()
150
-
151
- headers = {
152
- "Content-Type": content_type or DEFAULT_CONTENT_TYPE,
153
- "User-Agent": os.getenv("TIKOMNI_HTTP_USER_AGENT", DEFAULT_USER_AGENT),
168
+ except Exception as error:
169
+ return {
170
+ "ok": False,
171
+ "status_code": None,
172
+ "error_reason": f"u3_upload_failed:{normalize_text(error)}",
173
+ "retry_attempt": 0,
174
+ "timeout_retry_max": 0,
175
+ "timeout_retry_exhausted": False,
176
+ "retry_chain": [],
154
177
  }
155
- if isinstance(upload_headers, dict):
156
- for key, value in upload_headers.items():
157
- header_key = str(key).strip()
158
- if not header_key:
159
- continue
160
- headers[header_key] = str(value)
161
-
162
- request = urllib.request.Request(
163
- upload_url,
164
- data=data,
165
- headers=headers,
166
- method=(upload_method or "PUT").upper(),
178
+
179
+ headers = {
180
+ "Content-Type": content_type or DEFAULT_CONTENT_TYPE,
181
+ "User-Agent": os.getenv("TIKOMNI_HTTP_USER_AGENT", DEFAULT_USER_AGENT),
182
+ }
183
+ if isinstance(upload_headers, dict):
184
+ for key, value in upload_headers.items():
185
+ header_key = str(key).strip()
186
+ if not header_key:
187
+ continue
188
+ headers[header_key] = str(value)
189
+
190
+ retry_policy = resolve_timeout_retry_policy()
191
+ timeout_retry_max = int(retry_policy.get("max_retries", 0) or 0)
192
+ retry_backoff_ms = int(retry_policy.get("backoff_ms", 0) or 0)
193
+ max_attempts = 1 + timeout_retry_max
194
+ retry_chain: List[Dict[str, Any]] = []
195
+ last_result: Dict[str, Any] = {
196
+ "ok": False,
197
+ "status_code": None,
198
+ "error_reason": "u3_upload_failed:unknown",
199
+ }
200
+
201
+ for attempt in range(1, max_attempts + 1):
202
+ if attempt > 1 and retry_backoff_ms > 0:
203
+ sleep_ms = retry_backoff_ms * (2 ** (attempt - 2))
204
+ time.sleep(sleep_ms / 1000.0)
205
+
206
+ try:
207
+ request = urllib.request.Request(
208
+ upload_url,
209
+ data=data,
210
+ headers=headers,
211
+ method=(upload_method or "PUT").upper(),
212
+ )
213
+ with urllib.request.urlopen(request, timeout=max(timeout_ms / 1000.0, 1.0)) as response:
214
+ status_code = response.getcode()
215
+ result: Dict[str, Any] = {
216
+ "ok": 200 <= int(status_code) < 300,
217
+ "status_code": status_code,
218
+ "error_reason": None if 200 <= int(status_code) < 300 else f"u3_upload_http_{status_code}",
219
+ }
220
+ except urllib.error.HTTPError as error:
221
+ result = {
222
+ "ok": False,
223
+ "status_code": error.code,
224
+ "error_reason": f"u3_upload_http_{error.code}",
225
+ }
226
+ except urllib.error.URLError as error:
227
+ reason_obj = getattr(error, "reason", error)
228
+ reason_text = normalize_text(reason_obj)
229
+ result = {
230
+ "ok": False,
231
+ "status_code": None,
232
+ "error_reason": f"u3_upload_failed:{reason_text or 'network_error'}",
233
+ "_timeout_like": isinstance(reason_obj, socket.timeout)
234
+ or _is_timeout_like_upload_error(status_code=None, error_reason=reason_text),
235
+ }
236
+ except (TimeoutError, socket.timeout) as error:
237
+ result = {
238
+ "ok": False,
239
+ "status_code": None,
240
+ "error_reason": f"u3_upload_failed:{normalize_text(error) or 'timeout'}",
241
+ "_timeout_like": True,
242
+ }
243
+ except Exception as error:
244
+ reason_text = normalize_text(error)
245
+ result = {
246
+ "ok": False,
247
+ "status_code": None,
248
+ "error_reason": f"u3_upload_failed:{reason_text or 'unknown'}",
249
+ "_timeout_like": _is_timeout_like_upload_error(status_code=None, error_reason=reason_text),
250
+ }
251
+
252
+ if result.get("ok"):
253
+ result["retry_attempt"] = max(0, attempt - 1)
254
+ result["timeout_retry_max"] = timeout_retry_max
255
+ result["timeout_retry_exhausted"] = False
256
+ result["retry_chain"] = retry_chain
257
+ return result
258
+
259
+ timeout_like = bool(
260
+ result.pop(
261
+ "_timeout_like",
262
+ _is_timeout_like_upload_error(
263
+ status_code=result.get("status_code"),
264
+ error_reason=result.get("error_reason"),
265
+ ),
266
+ )
167
267
  )
168
- with urllib.request.urlopen(request, timeout=max(timeout_ms / 1000.0, 1.0)) as response:
169
- status_code = response.getcode()
170
- return {
171
- "ok": 200 <= int(status_code) < 300,
172
- "status_code": status_code,
173
- "error_reason": None if 200 <= int(status_code) < 300 else f"u3_upload_http_{status_code}",
268
+ retry_chain.append(
269
+ {
270
+ "attempt": attempt,
271
+ "status_code": result.get("status_code"),
272
+ "error_reason": result.get("error_reason"),
273
+ "timeout_like": timeout_like,
174
274
  }
175
- except urllib.error.HTTPError as error:
176
- return {"ok": False, "status_code": error.code, "error_reason": f"u3_upload_http_{error.code}"}
177
- except Exception as error:
178
- return {"ok": False, "status_code": None, "error_reason": f"u3_upload_failed:{normalize_text(error)}"}
275
+ )
276
+ last_result = dict(result)
277
+
278
+ if timeout_like and attempt < max_attempts:
279
+ continue
280
+
281
+ last_result["retry_attempt"] = max(0, attempt - 1)
282
+ last_result["timeout_retry_max"] = timeout_retry_max
283
+ last_result["timeout_retry_exhausted"] = bool(timeout_like and attempt >= max_attempts)
284
+ last_result["retry_chain"] = retry_chain
285
+ return last_result
286
+
287
+ last_result["retry_attempt"] = timeout_retry_max
288
+ last_result["timeout_retry_max"] = timeout_retry_max
289
+ last_result["timeout_retry_exhausted"] = True
290
+ last_result["retry_chain"] = retry_chain
291
+ return last_result
179
292
 
180
293
 
181
294
  def complete_u3_upload(
@@ -284,6 +397,11 @@ def run_u3_public_url_fallback(
284
397
  "ok": bool(upload_response.get("ok")),
285
398
  "status_code": upload_response.get("status_code"),
286
399
  "error_reason": upload_response.get("error_reason"),
400
+ "retry_attempt": upload_response.get("retry_attempt", 0),
401
+ "retry_count": len(upload_response.get("retry_chain") or []),
402
+ "timeout_retry_max": upload_response.get("timeout_retry_max", 0),
403
+ "timeout_retry_exhausted": bool(upload_response.get("timeout_retry_exhausted")),
404
+ "retry_chain": upload_response.get("retry_chain") or [],
287
405
  }
288
406
  )
289
407
  if not upload_response.get("ok"):
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env python3
2
+ """Shared Douyin metadata extraction helpers."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import re
7
+ from typing import Any, Dict, Optional
8
+
9
+ from scripts.core.tikomni_common import normalize_text
10
+
11
+ INVALID_AUTHOR_HANDLE_VALUES = {"0", "unknown", "none", "null", "nil", "na", "n/a"}
12
+ MUSIC_TITLE_PATTERN = re.compile(r"^@?.+?(?:创作的原声|作品使用的原声|的原声)$")
13
+
14
+
15
+ def _safe_int(value: Any) -> Optional[int]:
16
+ if value is None:
17
+ return None
18
+ if isinstance(value, bool):
19
+ return int(value)
20
+ if isinstance(value, int):
21
+ return value
22
+ if isinstance(value, float):
23
+ return int(value)
24
+
25
+ text = normalize_text(value)
26
+ if not text:
27
+ return None
28
+ try:
29
+ return int(float(text.replace(",", "")))
30
+ except Exception:
31
+ return None
32
+
33
+
34
def normalize_douyin_author_handle(*values: Any) -> str:
    """Pick the first usable author handle from the given candidates.

    Each candidate is passed through ``normalize_text``; blank results and
    results whose lower-cased form is listed in
    ``INVALID_AUTHOR_HANDLE_VALUES`` are skipped.

    Returns:
        The first accepted candidate, or ``""`` when none qualifies.
    """
    for raw in values:
        candidate = normalize_text(raw)
        if candidate and candidate.lower() not in INVALID_AUTHOR_HANDLE_VALUES:
            return candidate
    return ""
43
+
44
+
45
def looks_like_douyin_music_title(value: Any) -> bool:
    """Report whether a title matches ``MUSIC_TITLE_PATTERN``.

    The pattern accepts an optional leading ``@`` followed by text that ends
    in one of the "original sound" suffixes, i.e. the shape of an audio-track
    title rather than a work title.

    Returns:
        ``True`` when the normalized value is non-empty and matches the
        pattern; ``False`` otherwise.
    """
    candidate = normalize_text(value)
    return bool(candidate) and MUSIC_TITLE_PATTERN.match(candidate) is not None
50
+
51
+
52
def extract_douyin_caption(item: Dict[str, Any]) -> str:
    """Return the first non-empty caption-like field of a work item.

    Fields are tried in this order: ``desc``, ``caption``, ``content``,
    ``item_title``, ``preview_title``, ``title``. Each value is passed
    through ``normalize_text`` before the emptiness check.

    Returns:
        The first non-empty normalized value, or ``""`` when *item* is not a
        dict or none of the fields yields text.
    """
    if isinstance(item, dict):
        for field in ("desc", "caption", "content", "item_title", "preview_title", "title"):
            caption = normalize_text(item.get(field))
            if caption:
                return caption
    return ""
60
+
61
+
62
def title_from_douyin_caption(caption: Any) -> str:
    """Derive a title from a caption by dropping everything from the first hashtag on.

    Returns:
        The text preceding the first ``#tag`` (whitespace-trimmed). When that
        prefix is empty, e.g. the caption starts with a hashtag, the whole
        normalized caption is returned instead. A blank caption yields ``""``.
    """
    caption_text = normalize_text(caption)
    if not caption_text:
        return ""

    before_hashtag, *_ = re.split(r"\s*#\S+", caption_text, maxsplit=1)
    return before_hashtag.strip() or caption_text
71
+
72
+
73
def extract_douyin_title(item: Dict[str, Any]) -> str:
    """Choose the work title, preferring the work's own title-like fields.

    ``item_title``, ``preview_title`` and ``title`` are tried in that order.
    A candidate that looks like a music title is skipped whenever a
    caption-derived title is available. If no candidate is accepted, the
    caption-derived title (possibly ``""``) is returned.

    Returns:
        The selected title, or ``""`` when *item* is not a dict.
    """
    if not isinstance(item, dict):
        return ""

    # Only title-like fields on the work object itself are consulted; a nested
    # `music.title` names the audio track, not the work.
    fallback_title = title_from_douyin_caption(extract_douyin_caption(item))
    for field in ("item_title", "preview_title", "title"):
        own_title = normalize_text(item.get(field))
        if not own_title:
            continue
        is_music_title = looks_like_douyin_music_title(own_title)
        if not (is_music_title and fallback_title):
            return own_title
    return fallback_title
88
+
89
+
90
def extract_douyin_author(item: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Collect author identity fields from a Douyin work item.

    Args:
        item: Work payload. Author data is read from its ``author`` dict and
            from the top-level ``author_user_id`` field.

    Returns:
        A dict with the keys ``author_handle``, ``platform_author_id``,
        ``author_platform_id``, ``douyin_sec_uid``, ``douyin_aweme_author_id``,
        ``unique_id``, ``nickname`` and ``signature``. A value is ``None``
        whenever no non-empty text was found for it. When *item* is not a
        dict, every value is ``None``.
    """
    if not isinstance(item, dict):
        # Tolerate malformed payloads the same way extract_douyin_caption and
        # extract_douyin_title do, instead of raising AttributeError.
        return dict.fromkeys(
            (
                "author_handle",
                "platform_author_id",
                "author_platform_id",
                "douyin_sec_uid",
                "douyin_aweme_author_id",
                "unique_id",
                "nickname",
                "signature",
            )
        )

    raw_author = item.get("author")
    author = raw_author if isinstance(raw_author, dict) else {}

    author_platform_id = (
        normalize_text(author.get("uid"))
        or normalize_text(author.get("id"))
        or normalize_text(item.get("author_user_id"))
    )
    # The nickname is the last-resort handle when no id-like field is usable.
    author_handle = normalize_douyin_author_handle(
        author.get("unique_id"),
        author.get("short_id"),
        author.get("douyin_id"),
        author.get("display_id"),
        author.get("nickname"),
    )
    douyin_sec_uid = normalize_text(author.get("sec_uid"))
    douyin_aweme_author_id = normalize_text(item.get("author_user_id")) or author_platform_id

    return {
        "author_handle": author_handle or None,
        # Both spellings of the platform id key are emitted with the same value.
        "platform_author_id": author_platform_id or None,
        "author_platform_id": author_platform_id or None,
        "douyin_sec_uid": douyin_sec_uid or None,
        "douyin_aweme_author_id": douyin_aweme_author_id or None,
        "unique_id": normalize_text(author.get("unique_id")) or None,
        "nickname": normalize_text(author.get("nickname")) or None,
        "signature": normalize_text(author.get("signature")) or None,
    }
118
+
119
+
120
def extract_douyin_metrics(item: Dict[str, Any]) -> Dict[str, Optional[int]]:
    """Extract engagement counters from a Douyin work item.

    Each counter is looked up first in the item's ``statistics`` dict and then
    on the item itself, trying alias keys in order (``digg_count`` then
    ``like_count``, ``share_count`` then ``forward_count``, ``play_count``
    then ``view_count``).

    Args:
        item: Work payload. A non-dict value is treated as an empty payload.

    Returns:
        A dict with ``digg_count``, ``comment_count``, ``collect_count`` and
        ``share_count`` (each defaulting to ``0``) plus ``play_count``
        (defaulting to ``None``). ``play_count`` is also reset to ``None``
        when it is zero or negative while any other counter is positive.
    """
    if not isinstance(item, dict):
        # Tolerate malformed payloads the same way extract_douyin_caption and
        # extract_douyin_title do, instead of raising AttributeError.
        item = {}
    raw_statistics = item.get("statistics")
    statistics = raw_statistics if isinstance(raw_statistics, dict) else {}

    def metric(*keys: str, default: Optional[int] = 0) -> Optional[int]:
        # For every alias, the nested statistics dict wins over the item itself.
        for key in keys:
            for source in (statistics, item):
                raw = source.get(key)
                if raw is None:
                    continue
                value = _safe_int(raw)
                if value is not None:
                    return value
        return default

    metrics = {
        "digg_count": metric("digg_count", "like_count"),
        "comment_count": metric("comment_count"),
        "collect_count": metric("collect_count"),
        "share_count": metric("share_count", "forward_count"),
        "play_count": metric("play_count", "view_count", default=None),
    }

    play_count = metrics.get("play_count")
    engagement_floor = max(
        int(metrics.get("digg_count") or 0),
        int(metrics.get("comment_count") or 0),
        int(metrics.get("collect_count") or 0),
        int(metrics.get("share_count") or 0),
    )
    # A non-positive play count alongside real engagement is treated as
    # missing data rather than reported as a true zero.
    if play_count is not None and int(play_count) <= 0 and engagement_floor > 0:
        metrics["play_count"] = None

    return metrics
@@ -12,6 +12,7 @@ from scripts.core.progress_report import ProgressReporter
12
12
  from scripts.core.tikomni_common import normalize_text
13
13
  from scripts.core.asr_pipeline import (
14
14
  clamp_u2_batch_submit_size,
15
+ derive_asr_clean_text,
15
16
  normalize_media_url,
16
17
  run_u2_asr_batch_with_timeout_retry,
17
18
  run_u2_asr_candidates_with_timeout_retry,
@@ -123,6 +124,30 @@ def _clean_text(text: Any) -> str:
123
124
  return "\n".join(lines).strip()
124
125
 
125
126
 
127
def _build_transcript_result(
    raw_text: Any,
    *,
    subtitle_source: str,
    asr_source: str,
) -> Dict[str, Any]:
    """Build the transcript/ASR field set for a work from raw transcript text.

    The raw text is cleaned with ``_clean_text`` and then passed to
    ``derive_asr_clean_text``. ``primary_text`` is the derived clean text
    when it is non-empty and the cleaned transcript otherwise.

    Args:
        raw_text: Transcript or subtitle text before cleaning.
        subtitle_source: Value stored under ``subtitle_source``.
        asr_source: Value stored under ``asr_source``.

    Returns:
        A dict of subtitle/ASR fields. An empty cleaned transcript yields
        ``analysis_eligibility="incomplete"``,
        ``analysis_exclusion_reason="video_asr_unavailable"`` and
        ``asr_status="failed"``; otherwise the fields report
        ``"eligible"``, ``""`` and ``"success"``.
    """
    transcript = _clean_text(raw_text)
    asr_clean = derive_asr_clean_text(transcript)

    if transcript:
        eligibility, exclusion_reason, status = "eligible", "", "success"
    else:
        eligibility, exclusion_reason, status = "incomplete", "video_asr_unavailable", "failed"

    # NOTE(review): primary_text_source stays "asr_clean" even when
    # primary_text falls back to the cleaned transcript - confirm that
    # downstream consumers expect this label.
    return {
        "subtitle_raw": transcript,
        "subtitle_source": subtitle_source,
        "asr_raw": transcript,
        "asr_clean": asr_clean,
        "primary_text": asr_clean or transcript,
        "primary_text_source": "asr_clean",
        "analysis_eligibility": eligibility,
        "analysis_exclusion_reason": exclusion_reason,
        "asr_status": status,
        "asr_error_reason": "",
        "asr_source": asr_source,
    }
149
+
150
+
126
151
  def _subtitle_text_from_raw(raw: str) -> str:
127
152
  content = (raw or "").strip()
128
153
  if not content:
@@ -294,19 +319,11 @@ def _run_u2_for_work(
294
319
  }
295
320
 
296
321
  if transcript:
297
- return {
298
- "subtitle_raw": transcript,
299
- "subtitle_source": "external_asr",
300
- "asr_raw": transcript,
301
- "asr_clean": transcript,
302
- "primary_text": transcript,
303
- "primary_text_source": "asr_clean",
304
- "analysis_eligibility": "eligible",
305
- "analysis_exclusion_reason": "",
306
- "asr_status": "success",
307
- "asr_error_reason": "",
308
- "asr_source": "external_asr",
309
- }, trace
322
+ return _build_transcript_result(
323
+ transcript,
324
+ subtitle_source="external_asr",
325
+ asr_source="external_asr",
326
+ ), trace
310
327
 
311
328
  return {
312
329
  "subtitle_raw": "",
@@ -715,17 +732,11 @@ def _run_u2_batch_for_entries(
715
732
  if (mapped_ok or mapped_status in {"SUCCEEDED", "SUCCESS", "COMPLETED", "DONE"}) and transcript:
716
733
  for entry in grouped_entries:
717
734
  entry["work"].update(
718
- {
719
- "asr_raw": transcript,
720
- "asr_clean": transcript,
721
- "primary_text": transcript,
722
- "primary_text_source": "asr_clean",
723
- "analysis_eligibility": "eligible",
724
- "analysis_exclusion_reason": "",
725
- "asr_status": "success",
726
- "asr_error_reason": "",
727
- "asr_source": "external_asr",
728
- }
735
+ _build_transcript_result(
736
+ transcript,
737
+ subtitle_source="external_asr",
738
+ asr_source="external_asr",
739
+ )
729
740
  )
730
741
  mapped_count += 1
731
742
  else:
@@ -927,19 +938,11 @@ def enrich_author_home_asr(
927
938
  subtitle_invalid = _invalid_subtitle_reason(subtitle_text)
928
939
  if subtitle_invalid is None:
929
940
  work.update(
930
- {
931
- "subtitle_raw": subtitle_text,
932
- "subtitle_source": "native_subtitle",
933
- "asr_raw": subtitle_text,
934
- "asr_clean": subtitle_text,
935
- "primary_text": subtitle_text,
936
- "primary_text_source": "asr_clean",
937
- "analysis_eligibility": "eligible",
938
- "analysis_exclusion_reason": "",
939
- "asr_status": "success",
940
- "asr_error_reason": "",
941
- "asr_source": "native_subtitle",
942
- }
941
+ _build_transcript_result(
942
+ subtitle_text,
943
+ subtitle_source="native_subtitle",
944
+ asr_source="native_subtitle",
945
+ )
943
946
  )
944
947
  trace.append(
945
948
  {
@@ -9,6 +9,7 @@ from urllib.parse import parse_qs, urlparse
9
9
  from scripts.core.extract_pipeline import build_api_trace
10
10
  from scripts.core.progress_report import ProgressReporter
11
11
  from scripts.core.tikomni_common import call_json_api, deep_find_all, deep_find_first
12
+ from scripts.pipelines.input_contracts import extract_douyin_sec_uid, extract_xhs_user_id, looks_like_xhs_user_id
12
13
 
13
14
 
14
15
  def _to_text(value: Any) -> str:
@@ -359,23 +360,16 @@ def _call_xhs_route(
359
360
 
360
361
 
361
362
  def _guess_douyin_sec_user_id(input_value: str) -> str:
362
- value = (input_value or "").strip()
363
- if not value:
364
- return ""
365
- if "sec_uid=" in value:
366
- query = parse_qs(urlparse(value).query)
367
- sec = query.get("sec_uid") or query.get("sec_user_id")
368
- if sec and sec[0]:
369
- return sec[0]
370
- if value.startswith("MS4wLjAB") or value.startswith("MS4wLjA"):
371
- return value
372
- return ""
363
+ return str(extract_douyin_sec_uid(input_value) or "")
373
364
 
374
365
 
375
366
  def _guess_xhs_ids(input_value: str) -> Tuple[str, str]:
376
367
  value = (input_value or "").strip()
377
368
  if not value:
378
369
  return "", ""
370
+ direct_user_id = str(extract_xhs_user_id(value) or "")
371
+ if direct_user_id and looks_like_xhs_user_id(direct_user_id) and not value.startswith(("http://", "https://")):
372
+ return direct_user_id, ""
379
373
  parsed = urlparse(value)
380
374
  if parsed.query:
381
375
  query = parse_qs(parsed.query)