@tikomni/skills 0.1.7 → 0.1.8

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
@@ -28,9 +28,14 @@ from scripts.core.progress_report import build_progress_reporter
28
28
  from scripts.core.storage_router import resolve_author_directory_name
29
29
  from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
30
30
  from scripts.pipelines.home_asr import enrich_author_home_asr
31
+ from scripts.pipelines.input_contracts import normalize_xhs_creator_input
32
+ from scripts.pipelines.schema import build_author_profile
31
33
  from scripts.pipelines.xiaohongshu_creator_home_helpers import collect_and_adapt
32
34
  from scripts.writers.write_work_fact_card import build_work_fact_card, persist_output_envelope, write_work_fact_card
33
35
 
36
+ DEFAULT_MAX_ITEMS = 200
37
+ MAX_ITEMS_HARD_LIMIT = 200
38
+
34
39
 
35
40
  def _write_collection_artifacts(
36
41
  *,
@@ -81,11 +86,12 @@ def run_xiaohongshu_creator_home(
81
86
  *,
82
87
  input_value: str,
83
88
  config: Dict[str, Any],
84
- runtime: Dict[str, Any],
89
+ runtime: Dict[str, Any] | None,
85
90
  max_items: int,
86
91
  write_card: bool,
87
92
  persist_output: bool,
88
93
  ) -> Dict[str, Any]:
94
+ bounded_max_items = max(1, min(int(max_items), MAX_ITEMS_HARD_LIMIT))
89
95
  progress = build_progress_reporter(
90
96
  workflow="social-media-crawl",
91
97
  platform="xiaohongshu",
@@ -94,15 +100,69 @@ def run_xiaohongshu_creator_home(
94
100
  scope="workflow",
95
101
  )
96
102
  progress.started(stage="author_home.workflow", message="xiaohongshu author_home workflow started")
103
+ preflight = normalize_xhs_creator_input(input_value)
104
+ normalized_input_value = str(preflight.get("input_value") or "")
105
+ if preflight.get("error_reason"):
106
+ request_id = ensure_request_id(None, fallback_seed=input_value)
107
+ empty_profile = build_author_profile(platform="xiaohongshu", request_id=request_id)
108
+ extract_trace = [
109
+ {
110
+ "step": "input.preflight",
111
+ "ok": False,
112
+ "input_kind": "creator_url_or_user_id",
113
+ "normalized_input_value": normalized_input_value or None,
114
+ "error_reason": preflight.get("error_reason"),
115
+ "missing_fields": list(preflight.get("missing_fields") or []),
116
+ }
117
+ ]
118
+ envelope = {
119
+ "object_type": "creator",
120
+ "platform": "xiaohongshu",
121
+ "input": input_value,
122
+ "normalized": {
123
+ "creator_profile": {**empty_profile, "request_id": request_id, "extract_trace": extract_trace},
124
+ "work_collection": {
125
+ "platform": "xiaohongshu",
126
+ "platform_author_id": "",
127
+ "count": 0,
128
+ "items": [],
129
+ "request_id": request_id,
130
+ "extract_trace": extract_trace,
131
+ },
132
+ },
133
+ "completeness": evaluate_collection(empty_profile, []),
134
+ "missing_fields": normalize_missing_fields(preflight.get("missing_fields")),
135
+ "error_reason": str(preflight.get("error_reason") or "invalid_creator_input"),
136
+ "extract_trace": extract_trace,
137
+ "request_id": request_id,
138
+ "card_write": {
139
+ "enabled": bool(write_card),
140
+ "ok": False,
141
+ "count": 0,
142
+ "results": [],
143
+ "reason": "skipped_invalid_input",
144
+ },
145
+ "collection_artifacts": {},
146
+ "output_persist": {"enabled": False, "skipped": True, "reason": "invalid_input"},
147
+ }
148
+ progress.done(
149
+ stage="author_home.workflow",
150
+ message="xiaohongshu author_home workflow finished",
151
+ data={"request_id": request_id, "works_count": 0, "error_reason": envelope["error_reason"]},
152
+ )
153
+ return envelope
154
+
155
+ if runtime is None:
156
+ raise ValueError("runtime_required_for_valid_input")
97
157
 
98
158
  raw, profile, works, missing = collect_and_adapt(
99
- input_value=input_value,
159
+ input_value=normalized_input_value or input_value,
100
160
  base_url=runtime["base_url"],
101
161
  token=runtime["token"],
102
162
  timeout_ms=runtime["timeout_ms"],
103
163
  page_size=20,
104
164
  pages_max=50,
105
- max_items=max(1, int(max_items)),
165
+ max_items=bounded_max_items,
106
166
  progress=progress.child(scope="author_home.collect"),
107
167
  )
108
168
 
@@ -138,7 +198,7 @@ def run_xiaohongshu_creator_home(
138
198
 
139
199
  request_id = ensure_request_id(
140
200
  raw.get("request_id") or profile.get("request_id"),
141
- fallback_seed=input_value,
201
+ fallback_seed=normalized_input_value or input_value,
142
202
  )
143
203
  extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
144
204
 
@@ -206,7 +266,12 @@ def main() -> None:
206
266
  parser.add_argument("--allow-process-env", action="store_true", help="Allow process env overrides")
207
267
  parser.add_argument("--base-url", default=None, help="Override Tikomni base URL")
208
268
  parser.add_argument("--timeout-ms", type=int, default=None, help="Override timeout in ms")
209
- parser.add_argument("--max-items", type=int, default=5, help="Max works to collect from homepage")
269
+ parser.add_argument(
270
+ "--max-items",
271
+ type=int,
272
+ default=DEFAULT_MAX_ITEMS,
273
+ help=f"Max works to collect from homepage (default full crawl, capped at {MAX_ITEMS_HARD_LIMIT})",
274
+ )
210
275
  parser.set_defaults(write_card=True, persist_output=True)
211
276
  parser.add_argument("--write-card", dest="write_card", action="store_true", help="Write work fact cards")
212
277
  parser.add_argument("--no-write-card", dest="write_card", action="store_false", help="Skip card writing")
@@ -215,6 +280,19 @@ def main() -> None:
215
280
  args = parser.parse_args()
216
281
 
217
282
  config, _ = load_tikomni_config(args.config, env_file=args.env_file, allow_process_env=args.allow_process_env)
283
+ preflight = normalize_xhs_creator_input(args.input)
284
+ if preflight.get("error_reason"):
285
+ write_json_stdout(
286
+ run_xiaohongshu_creator_home(
287
+ input_value=args.input,
288
+ config=config,
289
+ runtime=None,
290
+ max_items=int(args.max_items),
291
+ write_card=bool(args.write_card),
292
+ persist_output=bool(args.persist_output),
293
+ )
294
+ )
295
+ return
218
296
  runtime = resolve_runtime(
219
297
  env_file=args.env_file,
220
298
  api_key_env=str(config_get(config, "runtime.auth_env_key", "TIKOMNI_API_KEY")),
@@ -224,7 +302,7 @@ def main() -> None:
224
302
  )
225
303
  write_json_stdout(
226
304
  run_xiaohongshu_creator_home(
227
- input_value=args.input,
305
+ input_value=str(preflight.get("input_value") or args.input),
228
306
  config=config,
229
307
  runtime=runtime,
230
308
  max_items=int(args.max_items),
@@ -40,6 +40,12 @@ from scripts.core.tikomni_common import (
40
40
  summarize_content,
41
41
  write_json_stdout,
42
42
  )
43
+ from scripts.pipelines.input_contracts import (
44
+ extract_xhs_note_id as extract_shared_xhs_note_id,
45
+ normalize_xhs_note_input,
46
+ text_has_xhs_short_link,
47
+ )
48
+ from scripts.pipelines.media_url_rules import filter_video_urls, is_probable_video_url
43
49
  from scripts.writers.write_work_fact_card import (
44
50
  build_work_output_envelope,
45
51
  persist_output_envelope,
@@ -194,36 +200,15 @@ def _finalize_result(
194
200
 
195
201
 
196
202
  def _normalize_input(input_value: Optional[str], share_text: Optional[str], note_id: Optional[str]) -> Dict[str, Optional[str]]:
197
- normalized_share = normalize_text(share_text) or None
198
- normalized_note_id = normalize_text(note_id) or None
199
-
200
- if input_value and not normalized_share and not normalized_note_id:
201
- candidate = input_value.strip()
202
- if candidate.startswith("http://") or candidate.startswith("https://"):
203
- normalized_share = candidate
204
- else:
205
- normalized_note_id = candidate
206
-
203
+ normalized = normalize_xhs_note_input(input_value, share_text, note_id)
207
204
  return {
208
- "share_text": normalized_share,
209
- "note_id": normalized_note_id,
205
+ "share_text": normalize_text(normalized.get("share_text")) or None,
206
+ "note_id": normalize_text(normalized.get("note_id")) or None,
210
207
  }
211
208
 
212
209
 
213
210
  def _extract_note_id_from_share(share_text: Optional[str]) -> Optional[str]:
214
- if not share_text:
215
- return None
216
- text = share_text.strip()
217
- patterns = [
218
- r"/explore/([0-9a-zA-Z]+)",
219
- r"/discovery/item/([0-9a-zA-Z]+)",
220
- r"note_id=([0-9a-zA-Z]+)",
221
- ]
222
- for pattern in patterns:
223
- match = re.search(pattern, text)
224
- if match:
225
- return match.group(1)
226
- return None
211
+ return extract_shared_xhs_note_id(share_text)
227
212
 
228
213
 
229
214
  def _resolve_note_id(payload: Any, source_input: Dict[str, Optional[str]]) -> Optional[str]:
@@ -256,13 +241,7 @@ def _resolve_note_id(payload: Any, source_input: Dict[str, Optional[str]]) -> Op
256
241
 
257
242
 
258
243
  def _is_short_share_url(share_text: Optional[str]) -> bool:
259
- if not share_text:
260
- return False
261
- try:
262
- host = urllib.parse.urlparse(share_text).netloc.lower()
263
- except Exception:
264
- return False
265
- return "xhslink.com" in host
244
+ return text_has_xhs_short_link(share_text)
266
245
 
267
246
 
268
247
  def _app_response_has_core_fields(response_data: Any) -> bool:
@@ -609,17 +588,19 @@ def _extract_xhs_metadata(
609
588
  if not cover_image and selected_image_urls:
610
589
  cover_image = selected_image_urls[0]
611
590
 
612
- video_down_url = _pick_text_from_paths(
613
- payload,
614
- [
591
+ video_down_url_candidates = [
592
+ _pick_text_from_paths(payload, [path])
593
+ for path in [
615
594
  ["video_down_url"],
616
595
  ["original_video_url"],
617
596
  ["video_url"],
618
597
  ["play_url"],
619
598
  ["master_url"],
620
599
  ["selected_video_url"],
621
- ],
622
- )
600
+ ]
601
+ ]
602
+ filtered_video_down_urls = filter_video_urls(video_down_url_candidates)
603
+ video_down_url = filtered_video_down_urls[0] if filtered_video_down_urls else ""
623
604
  if not video_down_url:
624
605
  video_down_url = normalize_text(selected_video_url)
625
606
 
@@ -1091,24 +1072,9 @@ def _url_likely_image(url: str) -> bool:
1091
1072
 
1092
1073
 
1093
1074
  def _url_likely_video(url: str) -> bool:
1094
- lower = url.lower()
1095
- video_tokens = [
1096
- ".mp4",
1097
- ".m3u8",
1098
- ".m4a",
1099
- ".mp3",
1100
- "video",
1101
- "play",
1102
- "stream",
1103
- "master",
1104
- "sns-video",
1105
- "redvideo",
1106
- "vod",
1107
- "/audio/",
1108
- ]
1109
1075
  if _url_likely_image(url):
1110
1076
  return False
1111
- return any(token in lower for token in video_tokens)
1077
+ return is_probable_video_url(url)
1112
1078
 
1113
1079
 
1114
1080
  def _video_quality_hint(url: str) -> int:
@@ -1174,7 +1140,7 @@ def _extract_video_candidates(payload: Any) -> List[str]:
1174
1140
  unique.append(url)
1175
1141
  seen.add(url)
1176
1142
 
1177
- video_only = [u for u in unique if _url_likely_video(u)]
1143
+ video_only = filter_video_urls([u for u in unique if _url_likely_video(u)])
1178
1144
  if not video_only:
1179
1145
  return []
1180
1146
 
@@ -1299,10 +1265,7 @@ def _detect_note_content_type(payload: Any, video_candidates: List[str], image_c
1299
1265
  if "image" in note_type_value:
1300
1266
  return "image"
1301
1267
 
1302
- note_sound_url = normalize_text(deep_find_first(payload, ["note_sound_info", "url"])).lower()
1303
- has_note_audio = bool(note_sound_url and any(token in note_sound_url for token in [".m4a", ".mp3", "/audio/"]))
1304
-
1305
- has_video = bool(video_candidates) or has_note_audio
1268
+ has_video = bool(video_candidates)
1306
1269
  has_image = bool(image_candidates)
1307
1270
  if has_video and has_image:
1308
1271
  return "mixed"
@@ -1494,7 +1457,11 @@ def run_xiaohongshu_extract(
1494
1457
  workflow_started_at = time.perf_counter()
1495
1458
  timings = _empty_timings()
1496
1459
  parse_started_at = time.perf_counter()
1497
- source_input = _normalize_input(input_value, share_text, note_id)
1460
+ preflight = normalize_xhs_note_input(input_value, share_text, note_id)
1461
+ source_input = {
1462
+ "share_text": normalize_text(preflight.get("share_text")) or None,
1463
+ "note_id": normalize_text(preflight.get("note_id")) or None,
1464
+ }
1498
1465
  timings["url_parse_ms"] = _elapsed_ms(parse_started_at)
1499
1466
  if progress is not None:
1500
1467
  progress.started(
@@ -1503,13 +1470,72 @@ def run_xiaohongshu_extract(
1503
1470
  data={"analysis_mode": analysis_mode, "write_card": bool(write_card), "persist_output": bool(persist_output)},
1504
1471
  )
1505
1472
  metadata_fields: Dict[str, Any] = {}
1473
+ preflight_trace = [
1474
+ {
1475
+ "step": "input.preflight",
1476
+ "ok": preflight.get("error_reason") is None,
1477
+ "input_kind": "share_text_or_note_id",
1478
+ "normalized_share_text": source_input.get("share_text"),
1479
+ "normalized_note_id": source_input.get("note_id"),
1480
+ "error_reason": preflight.get("error_reason"),
1481
+ "missing_fields": list(preflight.get("missing_fields") or []),
1482
+ }
1483
+ ]
1484
+ if preflight.get("error_reason"):
1485
+ result = _build_result(
1486
+ source_input=source_input,
1487
+ raw_content="",
1488
+ confidence="low",
1489
+ error_reason=str(preflight.get("error_reason") or "invalid_note_id"),
1490
+ extract_trace=preflight_trace,
1491
+ fallback_trace=[],
1492
+ request_id=None,
1493
+ text_source="none",
1494
+ note_id=None,
1495
+ subtitle_hit=False,
1496
+ u2_task_id=None,
1497
+ u2_task_status="UNKNOWN",
1498
+ note_content_type="unknown",
1499
+ analysis_mode=analysis_mode,
1500
+ selected_video_url=None,
1501
+ selected_video_candidates=[],
1502
+ selected_image_urls=[],
1503
+ downloaded_assets=[],
1504
+ missing_fields=list(preflight.get("missing_fields") or []),
1505
+ metadata_fields=metadata_fields,
1506
+ timings=timings,
1507
+ )
1508
+ if write_card:
1509
+ card_started_at = time.perf_counter()
1510
+ result["card_write"] = write_work_fact_card(
1511
+ payload=result,
1512
+ platform="xiaohongshu",
1513
+ card_type=card_type,
1514
+ card_root=card_root,
1515
+ content_kind="note",
1516
+ storage_config=storage_config,
1517
+ analysis_mode=analysis_mode,
1518
+ progress=progress.child(scope="card_write") if progress is not None else None,
1519
+ )
1520
+ timings["card_write_ms"] = _elapsed_ms(card_started_at)
1521
+ timings["llm_analysis_ms"] = _to_int_or_none((result.get("card_write") or {}).get("llm_analysis_ms")) or 0
1522
+ timings["total_ms"] = _elapsed_ms(workflow_started_at)
1523
+ result["timings"] = dict(timings)
1524
+ _update_pipeline_status(result)
1525
+ return _finalize_result(
1526
+ result=result,
1527
+ source_input=source_input,
1528
+ note_id=None,
1529
+ storage_config=storage_config,
1530
+ persist_output=persist_output,
1531
+ )
1506
1532
  if not source_input["share_text"] and not source_input["note_id"]:
1507
1533
  result = _build_result(
1508
1534
  source_input=source_input,
1509
1535
  raw_content="",
1510
1536
  confidence="low",
1511
1537
  error_reason="missing_share_text_or_note_id",
1512
- extract_trace=[],
1538
+ extract_trace=preflight_trace,
1513
1539
  fallback_trace=[],
1514
1540
  request_id=None,
1515
1541
  text_source="none",
@@ -215,10 +215,10 @@ def _resolve_primary_text(payload: Dict[str, Any], caption_raw: str) -> Dict[str
215
215
  subtitle_raw = _safe_text(payload.get("subtitle_raw"))
216
216
  asr_clean = _safe_text(payload.get("asr_clean"))
217
217
  asr_raw = _safe_text(payload.get("asr_raw"))
218
- if subtitle_raw:
219
- return {"primary_text": subtitle_raw, "primary_text_source": "subtitle_raw"}
220
218
  if asr_clean:
221
219
  return {"primary_text": asr_clean, "primary_text_source": "asr_clean"}
220
+ if subtitle_raw:
221
+ return {"primary_text": subtitle_raw, "primary_text_source": "subtitle_raw"}
222
222
  if asr_raw:
223
223
  return {"primary_text": asr_raw, "primary_text_source": "asr_raw"}
224
224
  if caption_raw:
@@ -356,7 +356,9 @@ def _markdown_lines(card: Dict[str, Any]) -> List[str]:
356
356
  lines.extend(["", "## 主文本", primary_text or ""])
357
357
  if caption_raw and caption_raw != primary_text:
358
358
  lines.extend(["", "## 原始文案", caption_raw])
359
- if subtitle_raw and subtitle_raw != primary_text:
359
+ if asr_raw and subtitle_raw and asr_raw == subtitle_raw and asr_raw != primary_text:
360
+ lines.extend(["", "## 原始转写", asr_raw])
361
+ elif subtitle_raw and subtitle_raw != primary_text:
360
362
  lines.extend(["", "## 原始字幕", subtitle_raw])
361
363
  if asr_raw and asr_raw not in {primary_text, subtitle_raw}:
362
364
  lines.extend(["", "## 原始转写", asr_raw])