@tikomni/skills 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/scripts/pipelines/douyin_metadata.py +151 -0
- package/skills/social-media-crawl/scripts/pipelines/home_asr.py +40 -37
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +5 -11
- package/skills/social-media-crawl/scripts/pipelines/input_contracts.py +318 -0
- package/skills/social-media-crawl/scripts/pipelines/media_url_rules.py +86 -0
- package/skills/social-media-crawl/scripts/pipelines/platform_adapters.py +77 -30
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +79 -73
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +86 -60
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +5 -3
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tikomni/skills",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.8",
|
|
4
4
|
"description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared Douyin metadata extraction helpers."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
from scripts.core.tikomni_common import normalize_text
|
|
10
|
+
|
|
11
|
+
INVALID_AUTHOR_HANDLE_VALUES = {"0", "unknown", "none", "null", "nil", "na", "n/a"}
|
|
12
|
+
MUSIC_TITLE_PATTERN = re.compile(r"^@?.+?(?:创作的原声|作品使用的原声|的原声)$")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _safe_int(value: Any) -> Optional[int]:
|
|
16
|
+
if value is None:
|
|
17
|
+
return None
|
|
18
|
+
if isinstance(value, bool):
|
|
19
|
+
return int(value)
|
|
20
|
+
if isinstance(value, int):
|
|
21
|
+
return value
|
|
22
|
+
if isinstance(value, float):
|
|
23
|
+
return int(value)
|
|
24
|
+
|
|
25
|
+
text = normalize_text(value)
|
|
26
|
+
if not text:
|
|
27
|
+
return None
|
|
28
|
+
try:
|
|
29
|
+
return int(float(text.replace(",", "")))
|
|
30
|
+
except Exception:
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def normalize_douyin_author_handle(*values: Any) -> str:
    """Return the first usable author handle among *values*.

    Candidates are normalized, and placeholder values such as "0", "none",
    or "n/a" are rejected. Returns "" when no candidate survives.
    """
    for raw_candidate in values:
        candidate = normalize_text(raw_candidate)
        if candidate and candidate.lower() not in INVALID_AUTHOR_HANDLE_VALUES:
            return candidate
    return ""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def looks_like_douyin_music_title(value: Any) -> bool:
    """Return True when *value* reads like an auto-generated Douyin audio title.

    Matches patterns such as "@某人创作的原声" / "…的原声" (see
    MUSIC_TITLE_PATTERN), which platforms attach to soundtracks rather
    than works.
    """
    cleaned = normalize_text(value)
    return bool(cleaned) and MUSIC_TITLE_PATTERN.match(cleaned) is not None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def extract_douyin_caption(item: Dict[str, Any]) -> str:
    """Return the first non-empty caption-like field on a Douyin work dict.

    Fields are tried in priority order (description first, generic title
    last); non-dict input yields "".
    """
    if not isinstance(item, dict):
        return ""
    caption_keys = ("desc", "caption", "content", "item_title", "preview_title", "title")
    return next(
        (text for key in caption_keys if (text := normalize_text(item.get(key)))),
        "",
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def title_from_douyin_caption(caption: Any) -> str:
    """Derive a display title from a caption by cutting off the hashtag tail.

    Falls back to the full caption when it consists of nothing but hashtags.
    """
    cleaned = normalize_text(caption)
    if not cleaned:
        return ""
    # Keep everything before the first "#tag" run.
    before_tags = re.split(r"\s*#\S+", cleaned, maxsplit=1)[0].strip()
    return before_tags or cleaned
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def extract_douyin_title(item: Dict[str, Any]) -> str:
    """Pick the best work title from a Douyin item dict.

    Prefers explicit title fields, but skips candidates that look like
    auto-generated audio/soundtrack titles whenever a caption-derived
    title is available instead. Returns "" for non-dict input.
    """
    if not isinstance(item, dict):
        return ""

    # Only read title-like fields from the work object itself.
    # Nested `music.title` is an audio title, not the work title.
    caption_title = title_from_douyin_caption(extract_douyin_caption(item))
    for key in ("item_title", "preview_title", "title"):
        candidate = normalize_text(item.get(key))
        if not candidate:
            continue
        # A "<user>的原声"-style field is the soundtrack name; prefer the
        # caption-derived title when one exists.
        if looks_like_douyin_music_title(candidate) and caption_title:
            continue
        return candidate
    return caption_title
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def extract_douyin_author(item: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Extract normalized author identity fields from a Douyin work dict.

    Reads the nested `author` object when present (ignored unless it is a
    dict) plus the work-level `author_user_id`. Every value in the returned
    dict is a non-empty string or None.
    """
    author = item.get("author") if isinstance(item.get("author"), dict) else {}

    # Numeric platform id: prefer the author object's uid/id, then the
    # work-level author_user_id.
    author_platform_id = (
        normalize_text(author.get("uid"))
        or normalize_text(author.get("id"))
        or normalize_text(item.get("author_user_id"))
    )
    # Handle candidates in decreasing specificity; placeholder values are
    # filtered by normalize_douyin_author_handle.
    author_handle = normalize_douyin_author_handle(
        author.get("unique_id"),
        author.get("short_id"),
        author.get("douyin_id"),
        author.get("display_id"),
        author.get("nickname"),
    )
    douyin_sec_uid = normalize_text(author.get("sec_uid"))
    douyin_aweme_author_id = normalize_text(item.get("author_user_id")) or author_platform_id

    # NOTE(review): "platform_author_id" and "author_platform_id" carry the
    # same value — presumably both key spellings exist in downstream schemas;
    # confirm before collapsing them.
    return {
        "author_handle": author_handle or None,
        "platform_author_id": author_platform_id or None,
        "author_platform_id": author_platform_id or None,
        "douyin_sec_uid": douyin_sec_uid or None,
        "douyin_aweme_author_id": douyin_aweme_author_id or None,
        "unique_id": normalize_text(author.get("unique_id")) or None,
        "nickname": normalize_text(author.get("nickname")) or None,
        "signature": normalize_text(author.get("signature")) or None,
    }
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def extract_douyin_metrics(item: Dict[str, Any]) -> Dict[str, Optional[int]]:
    """Extract engagement counters from a Douyin work dict.

    Each metric is looked up first in the nested `statistics` object, then
    on the work itself, across one or more alias keys. Counters default to
    0; `play_count` defaults to None (meaning "unknown" rather than zero).
    """
    statistics = item.get("statistics") if isinstance(item.get("statistics"), dict) else {}

    def metric(*keys: str, default: Optional[int] = 0) -> Optional[int]:
        # First parseable value wins; statistics takes precedence per key.
        for key in keys:
            value = _safe_int(statistics.get(key))
            if value is not None:
                return value
            value = _safe_int(item.get(key))
            if value is not None:
                return value
        return default

    metrics = {
        "digg_count": metric("digg_count", "like_count"),
        "comment_count": metric("comment_count"),
        "collect_count": metric("collect_count"),
        "share_count": metric("share_count", "forward_count"),
        "play_count": metric("play_count", "view_count", default=None),
    }

    # Sanity check: a work with real engagement cannot have zero plays, so a
    # reported play_count <= 0 alongside nonzero engagement is treated as
    # missing data (None) rather than a true zero.
    play_count = metrics.get("play_count")
    engagement_floor = max(
        int(metrics.get("digg_count") or 0),
        int(metrics.get("comment_count") or 0),
        int(metrics.get("collect_count") or 0),
        int(metrics.get("share_count") or 0),
    )
    if play_count is not None and int(play_count) <= 0 and engagement_floor > 0:
        metrics["play_count"] = None

    return metrics
|
|
@@ -12,6 +12,7 @@ from scripts.core.progress_report import ProgressReporter
|
|
|
12
12
|
from scripts.core.tikomni_common import normalize_text
|
|
13
13
|
from scripts.core.asr_pipeline import (
|
|
14
14
|
clamp_u2_batch_submit_size,
|
|
15
|
+
derive_asr_clean_text,
|
|
15
16
|
normalize_media_url,
|
|
16
17
|
run_u2_asr_batch_with_timeout_retry,
|
|
17
18
|
run_u2_asr_candidates_with_timeout_retry,
|
|
@@ -123,6 +124,30 @@ def _clean_text(text: Any) -> str:
|
|
|
123
124
|
return "\n".join(lines).strip()
|
|
124
125
|
|
|
125
126
|
|
|
127
|
+
def _build_transcript_result(
    raw_text: Any,
    *,
    subtitle_source: str,
    asr_source: str,
) -> Dict[str, Any]:
    """Build the standard transcript-result dict from a raw transcript.

    Cleans *raw_text*, derives the ASR-clean variant, and fills the
    eligibility/status fields based on whether any transcript text survived
    cleaning. `subtitle_source`/`asr_source` label where the text came from
    (e.g. "external_asr" or "native_subtitle").
    """
    transcript = _clean_text(raw_text)
    asr_clean = derive_asr_clean_text(transcript)
    primary_text = asr_clean or transcript
    return {
        "subtitle_raw": transcript,
        "subtitle_source": subtitle_source,
        "asr_raw": transcript,
        "asr_clean": asr_clean,
        "primary_text": primary_text,
        # NOTE(review): this stays "asr_clean" even when primary_text fell
        # back to the raw transcript above — confirm that is intended.
        "primary_text_source": "asr_clean",
        "analysis_eligibility": "eligible" if transcript else "incomplete",
        "analysis_exclusion_reason": "" if transcript else "video_asr_unavailable",
        "asr_status": "success" if transcript else "failed",
        "asr_error_reason": "",
        "asr_source": asr_source,
    }
|
|
149
|
+
|
|
150
|
+
|
|
126
151
|
def _subtitle_text_from_raw(raw: str) -> str:
|
|
127
152
|
content = (raw or "").strip()
|
|
128
153
|
if not content:
|
|
@@ -294,19 +319,11 @@ def _run_u2_for_work(
|
|
|
294
319
|
}
|
|
295
320
|
|
|
296
321
|
if transcript:
|
|
297
|
-
return
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
"
|
|
301
|
-
|
|
302
|
-
"primary_text": transcript,
|
|
303
|
-
"primary_text_source": "asr_clean",
|
|
304
|
-
"analysis_eligibility": "eligible",
|
|
305
|
-
"analysis_exclusion_reason": "",
|
|
306
|
-
"asr_status": "success",
|
|
307
|
-
"asr_error_reason": "",
|
|
308
|
-
"asr_source": "external_asr",
|
|
309
|
-
}, trace
|
|
322
|
+
return _build_transcript_result(
|
|
323
|
+
transcript,
|
|
324
|
+
subtitle_source="external_asr",
|
|
325
|
+
asr_source="external_asr",
|
|
326
|
+
), trace
|
|
310
327
|
|
|
311
328
|
return {
|
|
312
329
|
"subtitle_raw": "",
|
|
@@ -715,17 +732,11 @@ def _run_u2_batch_for_entries(
|
|
|
715
732
|
if (mapped_ok or mapped_status in {"SUCCEEDED", "SUCCESS", "COMPLETED", "DONE"}) and transcript:
|
|
716
733
|
for entry in grouped_entries:
|
|
717
734
|
entry["work"].update(
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
"
|
|
721
|
-
"
|
|
722
|
-
|
|
723
|
-
"analysis_eligibility": "eligible",
|
|
724
|
-
"analysis_exclusion_reason": "",
|
|
725
|
-
"asr_status": "success",
|
|
726
|
-
"asr_error_reason": "",
|
|
727
|
-
"asr_source": "external_asr",
|
|
728
|
-
}
|
|
735
|
+
_build_transcript_result(
|
|
736
|
+
transcript,
|
|
737
|
+
subtitle_source="external_asr",
|
|
738
|
+
asr_source="external_asr",
|
|
739
|
+
)
|
|
729
740
|
)
|
|
730
741
|
mapped_count += 1
|
|
731
742
|
else:
|
|
@@ -927,19 +938,11 @@ def enrich_author_home_asr(
|
|
|
927
938
|
subtitle_invalid = _invalid_subtitle_reason(subtitle_text)
|
|
928
939
|
if subtitle_invalid is None:
|
|
929
940
|
work.update(
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
"
|
|
934
|
-
|
|
935
|
-
"primary_text": subtitle_text,
|
|
936
|
-
"primary_text_source": "asr_clean",
|
|
937
|
-
"analysis_eligibility": "eligible",
|
|
938
|
-
"analysis_exclusion_reason": "",
|
|
939
|
-
"asr_status": "success",
|
|
940
|
-
"asr_error_reason": "",
|
|
941
|
-
"asr_source": "native_subtitle",
|
|
942
|
-
}
|
|
941
|
+
_build_transcript_result(
|
|
942
|
+
subtitle_text,
|
|
943
|
+
subtitle_source="native_subtitle",
|
|
944
|
+
asr_source="native_subtitle",
|
|
945
|
+
)
|
|
943
946
|
)
|
|
944
947
|
trace.append(
|
|
945
948
|
{
|
|
@@ -9,6 +9,7 @@ from urllib.parse import parse_qs, urlparse
|
|
|
9
9
|
from scripts.core.extract_pipeline import build_api_trace
|
|
10
10
|
from scripts.core.progress_report import ProgressReporter
|
|
11
11
|
from scripts.core.tikomni_common import call_json_api, deep_find_all, deep_find_first
|
|
12
|
+
from scripts.pipelines.input_contracts import extract_douyin_sec_uid, extract_xhs_user_id, looks_like_xhs_user_id
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
def _to_text(value: Any) -> str:
|
|
@@ -359,23 +360,16 @@ def _call_xhs_route(
|
|
|
359
360
|
|
|
360
361
|
|
|
361
362
|
def _guess_douyin_sec_user_id(input_value: str) -> str:
|
|
362
|
-
|
|
363
|
-
if not value:
|
|
364
|
-
return ""
|
|
365
|
-
if "sec_uid=" in value:
|
|
366
|
-
query = parse_qs(urlparse(value).query)
|
|
367
|
-
sec = query.get("sec_uid") or query.get("sec_user_id")
|
|
368
|
-
if sec and sec[0]:
|
|
369
|
-
return sec[0]
|
|
370
|
-
if value.startswith("MS4wLjAB") or value.startswith("MS4wLjA"):
|
|
371
|
-
return value
|
|
372
|
-
return ""
|
|
363
|
+
return str(extract_douyin_sec_uid(input_value) or "")
|
|
373
364
|
|
|
374
365
|
|
|
375
366
|
def _guess_xhs_ids(input_value: str) -> Tuple[str, str]:
|
|
376
367
|
value = (input_value or "").strip()
|
|
377
368
|
if not value:
|
|
378
369
|
return "", ""
|
|
370
|
+
direct_user_id = str(extract_xhs_user_id(value) or "")
|
|
371
|
+
if direct_user_id and looks_like_xhs_user_id(direct_user_id) and not value.startswith(("http://", "https://")):
|
|
372
|
+
return direct_user_id, ""
|
|
379
373
|
parsed = urlparse(value)
|
|
380
374
|
if parsed.query:
|
|
381
375
|
query = parse_qs(parsed.query)
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared input normalization and validation for social-media pipelines."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
from urllib.parse import parse_qs, urlparse
|
|
9
|
+
|
|
10
|
+
from scripts.core.tikomni_common import normalize_text
|
|
11
|
+
|
|
12
|
+
# Greedy http(s) URL matcher for free text; the negated class stops at
# whitespace plus ASCII/CJK punctuation that commonly trails a pasted link.
_HTTP_URL_RE = re.compile(r"https?://[^\s<>'\",。!?;:)】》]+", re.IGNORECASE)
# Punctuation stripped from the tail of a captured URL (the regex above can
# still swallow a trailing bracket/comma).
_URL_TRAILING_PUNCTUATION = ".,!?;:)]}>'\",。!?;:)】》、"
# Xiaohongshu note ids: 16-32 alphanumerics; user ids: 8-32 alphanumerics.
_XHS_NOTE_ID_RE = re.compile(r"^[0-9A-Za-z]{16,32}$")
_XHS_USER_ID_RE = re.compile(r"^[0-9A-Za-z]{8,32}$")
# Douyin sec_uid values start with the "MS4wLjA" prefix followed by a
# urlsafe-base64-style tail.
_DOUYIN_SEC_UID_RE = re.compile(r"^MS4wLjA[A-Za-z0-9_-]{8,}$")

# Host fragments used to classify URLs by platform.
_DOUYIN_HOST_TOKENS = ("douyin.com", "iesdouyin.com", "v.douyin.com")
_XHS_HOST_TOKENS = ("xiaohongshu.com", "xhslink.com")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _dedupe_keep_order(items: List[str]) -> List[str]:
|
|
23
|
+
unique: List[str] = []
|
|
24
|
+
seen = set()
|
|
25
|
+
for item in items:
|
|
26
|
+
if item in seen:
|
|
27
|
+
continue
|
|
28
|
+
unique.append(item)
|
|
29
|
+
seen.add(item)
|
|
30
|
+
return unique
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _strip_url_punctuation(value: str) -> str:
    """Drop trailing punctuation that the URL regex may have captured."""
    while value and value[-1] in _URL_TRAILING_PUNCTUATION:
        value = value[:-1]
    return value
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def extract_http_urls(text: Optional[str]) -> List[str]:
    """Extract all distinct http(s) URLs from *text*, in order of appearance.

    Returns [] for empty/None input. Trailing punctuation swallowed by the
    greedy regex is stripped before deduplication.
    """
    raw = normalize_text(text)
    if not raw:
        return []
    matches = [_strip_url_punctuation(match.group(0)) for match in _HTTP_URL_RE.finditer(raw)]
    return _dedupe_keep_order([item for item in matches if item])
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _first_platform_url(text: Optional[str], host_tokens: tuple[str, ...]) -> Optional[str]:
    """Return the first URL in *text* whose host contains any of *host_tokens*."""
    for candidate in extract_http_urls(text):
        netloc = urlparse(candidate).netloc.lower()
        matched = any(token in netloc for token in host_tokens)
        if matched:
            return candidate
    return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _text_contains_host(text: Optional[str], host_tokens: tuple[str, ...]) -> bool:
    """True when the lowercased text mentions any of the given host fragments."""
    haystack = normalize_text(text).lower()
    for token in host_tokens:
        if token in haystack:
            return True
    return False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def looks_like_douyin_sec_uid(value: Optional[str]) -> bool:
    """True when *value* has the Douyin sec_uid shape (see _DOUYIN_SEC_UID_RE)."""
    return bool(_DOUYIN_SEC_UID_RE.fullmatch(normalize_text(value)))


def looks_like_xhs_note_id(value: Optional[str]) -> bool:
    """True when *value* has the Xiaohongshu note-id shape (16-32 alphanumerics)."""
    return bool(_XHS_NOTE_ID_RE.fullmatch(normalize_text(value)))


def looks_like_xhs_user_id(value: Optional[str]) -> bool:
    """True when *value* has the Xiaohongshu user-id shape (8-32 alphanumerics)."""
    return bool(_XHS_USER_ID_RE.fullmatch(normalize_text(value)))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def extract_douyin_share_url(text: Optional[str]) -> Optional[str]:
    """Return the first Douyin-hosted URL found in *text*, or None."""
    return _first_platform_url(text, _DOUYIN_HOST_TOKENS)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def extract_douyin_sec_uid(text: Optional[str]) -> Optional[str]:
    """Find a Douyin sec_uid in free text.

    Checks, in order: the text itself as a bare sec_uid, the query
    parameters of any embedded URLs, and finally a raw "sec_uid=..."
    fragment outside a parseable URL. Returns None when nothing matches.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        return None
    if looks_like_douyin_sec_uid(cleaned):
        return cleaned

    # Prefer well-formed URLs: parse their query strings for the id params.
    for candidate_url in extract_http_urls(cleaned):
        params = parse_qs(urlparse(candidate_url).query)
        for param_name in ("sec_uid", "sec_user_id"):
            values = params.get(param_name) or [""]
            candidate = normalize_text(values[0])
            if looks_like_douyin_sec_uid(candidate):
                return candidate

    # Last resort: a bare key=value fragment anywhere in the text.
    fragment = re.search(r"(?:sec_uid|sec_user_id)=([A-Za-z0-9._-]+)", cleaned)
    if fragment is None:
        return None
    candidate = normalize_text(fragment.group(1))
    return candidate if looks_like_douyin_sec_uid(candidate) else None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def extract_xhs_note_id(text: Optional[str]) -> Optional[str]:
    """Find a Xiaohongshu note id in free text.

    Accepts the text itself as a bare note id, or extracts one from
    /explore/, /discovery/item/ paths or a note_id= query parameter.
    Returns None when nothing matches.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        return None
    if looks_like_xhs_note_id(cleaned):
        return cleaned

    url_patterns = (
        r"/explore/([0-9A-Za-z]+)",
        r"/discovery/item/([0-9A-Za-z]+)",
        r"note_id=([0-9A-Za-z]+)",
    )
    for url_pattern in url_patterns:
        found = re.search(url_pattern, cleaned)
        if found is None:
            continue
        candidate = normalize_text(found.group(1))
        if looks_like_xhs_note_id(candidate):
            return candidate
    return None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def extract_xhs_user_id(text: Optional[str]) -> Optional[str]:
    """Find a Xiaohongshu user id in free text.

    Accepts the text itself as a bare user id, or extracts one from a
    /user/profile/ path or a user_id=/userid= query parameter. Returns
    None when nothing matches.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        return None
    if looks_like_xhs_user_id(cleaned):
        return cleaned

    url_patterns = (
        r"/user/profile/([0-9A-Za-z]+)",
        r"(?:user_id|userid)=([0-9A-Za-z]+)",
    )
    for url_pattern in url_patterns:
        found = re.search(url_pattern, cleaned)
        if found is None:
            continue
        candidate = normalize_text(found.group(1))
        if looks_like_xhs_user_id(candidate):
            return candidate
    return None
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def extract_xhs_note_share_url(text: Optional[str]) -> Optional[str]:
    """Return the first URL in *text* that plausibly points at an XHS note.

    xhslink.com short links are accepted as-is — they cannot be classified
    further without resolving them. Full xiaohongshu.com URLs must contain
    a note path or note_id parameter.
    """
    for url in extract_http_urls(text):
        parsed = urlparse(url)
        host = parsed.netloc.lower()
        path = parsed.path.lower()
        query = parsed.query.lower()
        if "xhslink.com" in host:
            return url
        if "xiaohongshu.com" not in host:
            continue
        # Note pages live under /explore/ or /discovery/item/, or carry note_id=.
        if "/explore/" in path or "/discovery/item/" in path or "note_id=" in query:
            return url
    return None
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def extract_xhs_creator_share_url(text: Optional[str]) -> Optional[str]:
    """Return the first URL in *text* that plausibly points at an XHS creator.

    xhslink.com short links are accepted as-is (unresolvable here). Full
    xiaohongshu.com URLs must contain a user path or user-id parameter.
    """
    for url in extract_http_urls(text):
        parsed = urlparse(url)
        host = parsed.netloc.lower()
        path = parsed.path.lower()
        query = parsed.query.lower()
        if "xhslink.com" in host:
            return url
        if "xiaohongshu.com" not in host:
            continue
        # NOTE(review): "/user/" also matches any path containing that
        # segment (not just profile pages) — presumably intentional breadth;
        # confirm against real share-URL shapes.
        if "/user/profile/" in path or "/user/" in path or "user_id=" in query or "userid=" in query:
            return url
    return None
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def text_has_douyin_short_link(text: Optional[str]) -> bool:
    """True when *text* mentions a v.douyin.com short link (possibly unresolved)."""
    return _text_contains_host(text, ("v.douyin.com",))


def text_has_xhs_short_link(text: Optional[str]) -> bool:
    """True when *text* mentions an xhslink.com short link (possibly unresolved)."""
    return _text_contains_host(text, ("xhslink.com",))
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def normalize_douyin_work_input(input_value: Optional[str], share_url: Optional[str]) -> Dict[str, object]:
    """Validate/normalize the inputs for a single Douyin work.

    Returns a contract dict:
      share_url      -- normalized Douyin URL, or None
      error_reason   -- machine-readable failure tag, or None
      missing_fields -- list of {field, reason} dicts for reporting

    The explicit *share_url* takes precedence over *input_value*. Empty
    input is not an error here; the caller decides how to report it.
    """
    explicit_share = normalize_text(share_url)
    raw_input = normalize_text(input_value)
    normalized_share = extract_douyin_share_url(explicit_share) or extract_douyin_share_url(raw_input)

    if normalized_share:
        return {
            "share_url": normalized_share,
            "error_reason": None,
            "missing_fields": [],
        }

    if raw_input:
        # A v.douyin.com short link that survived extraction means the link
        # needs resolving upstream before it can be used.
        if text_has_douyin_short_link(raw_input):
            return {
                "share_url": None,
                "error_reason": "short_link_detected_but_unresolved",
                "missing_fields": [{"field": "share_url", "reason": "short_link_unresolved"}],
            }
        return {
            "share_url": None,
            "error_reason": "invalid_share_url",
            "missing_fields": [{"field": "share_url", "reason": "invalid_input"}],
        }

    return {
        "share_url": None,
        "error_reason": None,
        "missing_fields": [],
    }
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def normalize_xhs_note_input(
    input_value: Optional[str],
    share_text: Optional[str],
    note_id: Optional[str],
) -> Dict[str, object]:
    """Validate/normalize the inputs for a single Xiaohongshu note.

    Returns a contract dict:
      share_text     -- normalized note URL, or None
      note_id        -- normalized note id, or None
      error_reason   -- machine-readable failure tag, or None
      missing_fields -- list of {field, reason} dicts for reporting

    An explicit *note_id* with a bad shape fails fast; otherwise the note
    id and share URL are extracted from *share_text* first, then from
    *input_value*. Empty input is not an error here.
    """
    explicit_note_id = normalize_text(note_id)
    # Fail fast on an explicitly supplied but malformed note id.
    if explicit_note_id and not looks_like_xhs_note_id(explicit_note_id):
        return {
            "share_text": None,
            "note_id": None,
            "error_reason": "invalid_note_id",
            "missing_fields": [{"field": "note_id", "reason": "invalid_format"}],
        }

    explicit_share = normalize_text(share_text)
    raw_input = normalize_text(input_value)

    normalized_note_id = explicit_note_id or extract_xhs_note_id(explicit_share) or extract_xhs_note_id(raw_input)
    normalized_share = extract_xhs_note_share_url(explicit_share) or extract_xhs_note_share_url(raw_input)

    # Either a note id or a share URL is enough to proceed.
    if normalized_note_id or normalized_share:
        return {
            "share_text": normalized_share or None,
            "note_id": normalized_note_id or None,
            "error_reason": None,
            "missing_fields": [],
        }

    candidate = explicit_share or raw_input
    if candidate:
        # An xhslink.com short link needs resolving upstream first.
        if text_has_xhs_short_link(candidate):
            return {
                "share_text": None,
                "note_id": None,
                "error_reason": "short_link_detected_but_unresolved",
                "missing_fields": [{"field": "note_id", "reason": "short_link_unresolved"}],
            }
        return {
            "share_text": None,
            "note_id": None,
            "error_reason": "invalid_note_id",
            "missing_fields": [{"field": "note_id", "reason": "invalid_format"}],
        }

    return {
        "share_text": None,
        "note_id": None,
        "error_reason": None,
        "missing_fields": [],
    }
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def normalize_douyin_creator_input(input_value: Optional[str]) -> Dict[str, object]:
    """Validate/normalize a Douyin creator input (sec_uid or profile URL).

    Returns a contract dict:
      input_value    -- normalized sec_uid or Douyin URL, or None
      error_reason   -- machine-readable failure tag, or None
      missing_fields -- list of {field, reason} dicts for reporting

    Empty input is not an error here; the caller decides how to report it.
    """
    raw_input = normalize_text(input_value)
    # Extract once — the original called extract_douyin_sec_uid and
    # extract_douyin_share_url twice each (once to build the value, once for
    # the success test), and its `or raw_input or None` fallback was dead
    # code because the success branch already required sec/share to be set.
    normalized_input = extract_douyin_sec_uid(raw_input) or extract_douyin_share_url(raw_input)

    if normalized_input:
        return {
            "input_value": normalized_input,
            "error_reason": None,
            "missing_fields": [],
        }

    if raw_input:
        # A v.douyin.com short link needs resolving upstream first.
        if text_has_douyin_short_link(raw_input):
            return {
                "input_value": None,
                "error_reason": "short_link_detected_but_unresolved",
                "missing_fields": [{"field": "platform_author_id", "reason": "short_link_unresolved"}],
            }
        return {
            "input_value": None,
            "error_reason": "invalid_creator_input",
            "missing_fields": [{"field": "platform_author_id", "reason": "invalid_format"}],
        }

    return {
        "input_value": None,
        "error_reason": None,
        "missing_fields": [],
    }
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def normalize_xhs_creator_input(input_value: Optional[str]) -> Dict[str, object]:
    """Validate/normalize a Xiaohongshu creator input (user id or profile URL).

    Returns a contract dict:
      input_value    -- normalized user id or creator URL, or None
      error_reason   -- machine-readable failure tag, or None
      missing_fields -- list of {field, reason} dicts for reporting

    Empty input is not an error here; the caller decides how to report it.
    """
    raw_input = normalize_text(input_value)
    # Prefer a bare/extractable user id, then a recognizable creator URL.
    normalized_input = extract_xhs_user_id(raw_input) or extract_xhs_creator_share_url(raw_input) or None

    if normalized_input:
        return {
            "input_value": normalized_input,
            "error_reason": None,
            "missing_fields": [],
        }

    if raw_input:
        # An xhslink.com short link needs resolving upstream first.
        if text_has_xhs_short_link(raw_input):
            return {
                "input_value": None,
                "error_reason": "short_link_detected_but_unresolved",
                "missing_fields": [{"field": "platform_author_id", "reason": "short_link_unresolved"}],
            }
        return {
            "input_value": None,
            "error_reason": "invalid_creator_input",
            "missing_fields": [{"field": "platform_author_id", "reason": "invalid_format"}],
        }

    return {
        "input_value": None,
        "error_reason": None,
        "missing_fields": [],
    }
|