@tikomni/skills 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/creator-analysis/SKILL.md +34 -10
- package/skills/creator-analysis/references/contracts/creator-card-fields.md +2 -0
- package/skills/creator-analysis/references/contracts/work-card-fields.md +40 -4
- package/skills/creator-analysis/references/platform-guides/douyin.md +41 -36
- package/skills/creator-analysis/references/platform-guides/generic.md +11 -7
- package/skills/creator-analysis/references/platform-guides/xiaohongshu.md +45 -30
- package/skills/creator-analysis/references/schemas/author-analysis-v2.schema.json +224 -95
- package/skills/creator-analysis/references/workflow.md +8 -3
- package/skills/creator-analysis/scripts/author_home/adapters/platform_adapters.py +205 -21
- package/skills/creator-analysis/scripts/author_home/analyzers/author_analysis_v2_support.py +54 -11
- package/skills/creator-analysis/scripts/author_home/analyzers/prompt_first_analyzers.py +200 -13
- package/skills/creator-analysis/scripts/author_home/analyzers/sampled_work_batch_explainer.py +113 -42
- package/skills/creator-analysis/scripts/author_home/asr/home_asr.py +65 -7
- package/skills/creator-analysis/scripts/author_home/builders/home_builders.py +82 -18
- package/skills/creator-analysis/scripts/author_home/collectors/homepage_collectors.py +198 -32
- package/skills/creator-analysis/scripts/author_home/orchestrator/run_author_analysis.py +374 -31
- package/skills/creator-analysis/scripts/author_home/orchestrator/work_analysis_artifacts.py +68 -12
- package/skills/creator-analysis/scripts/core/storage_router.py +3 -0
- package/skills/creator-analysis/scripts/writers/write_author_homepage_samples.py +3 -2
- package/skills/creator-analysis/scripts/writers/write_benchmark_card.py +314 -137
package/skills/creator-analysis/scripts/author_home/adapters/platform_adapters.py

@@ -58,6 +58,46 @@ def _pick_http_urls(payload: Any, keys: List[str]) -> List[str]:
     return deduped
 
 
+def _extract_first_url(value: Any) -> str:
+    if isinstance(value, str):
+        text = value.strip()
+        return text if text.startswith("http://") or text.startswith("https://") else ""
+    if isinstance(value, list):
+        for item in value:
+            url = _extract_first_url(item)
+            if url:
+                return url
+        return ""
+    if isinstance(value, dict):
+        for key in ("url_list", "url", "uri", "avatar_url", "cover_url", "src"):
+            if key in value:
+                url = _extract_first_url(value.get(key))
+                if url:
+                    return url
+        return ""
+    return ""
+
+
+def _normalize_douyin_tags(value: Any) -> List[str]:
+    if not isinstance(value, list):
+        return []
+    tags: List[str] = []
+    for item in value:
+        if isinstance(item, str):
+            text = item.strip().lstrip("#")
+            if text:
+                tags.append(text)
+            continue
+        if not isinstance(item, dict):
+            continue
+        for key in ("hashtag_name", "search_text", "tag_name", "name", "text"):
+            text = _t(item.get(key)).lstrip("#")
+            if text:
+                tags.append(text)
+                break
+    return list(dict.fromkeys(tags))
+
+
 def _is_probable_video_url(url: str) -> bool:
     lower = (url or "").lower()
     if not (lower.startswith("http://") or lower.startswith("https://")):
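For illustration, a usage sketch of the new _extract_first_url helper; the import path and the sample payloads below are assumptions, not part of the published package:

    # Hypothetical usage; assumes the adapter module is importable like this.
    from author_home.adapters.platform_adapters import _extract_first_url

    # Nested image payloads are walked recursively until the first http(s)
    # string turns up; anything else yields "".
    assert _extract_first_url({"url_list": ["https://example.com/a.jpeg"]}) == "https://example.com/a.jpeg"
    assert _extract_first_url(["", {"url": "https://example.com/b"}]) == "https://example.com/b"
    assert _extract_first_url("ftp://example.com/c") == ""  # only http(s) accepted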
@@ -93,13 +133,54 @@ def _extract_xhs_video_down_url(item: Dict[str, Any]) -> str:
     return ""
 
 
+def _normalize_text_list(value: Any) -> List[str]:
+    values: List[str] = []
+    if isinstance(value, list):
+        items = value
+    else:
+        items = [value]
+    for item in items:
+        if isinstance(item, str):
+            text = item.strip().lstrip("#")
+            if text:
+                values.append(text)
+            continue
+        if not isinstance(item, dict):
+            continue
+        for key in ("name", "tag_name", "tag", "text", "display_text", "title"):
+            text = _t(item.get(key)).lstrip("#")
+            if text:
+                values.append(text)
+                break
+    return list(dict.fromkeys(values))
+
+
 def _extract_xhs_subtitle_inline(item: Dict[str, Any]) -> str:
     lines: List[str] = []
-    for container in deep_find_all(
+    for container in deep_find_all(
+        item,
+        [
+            "subtitles",
+            "subtitle_list",
+            "subtitleList",
+            "subtitle",
+            "subtitle_text",
+            "caption_text",
+            "transcript",
+            "transcript_text",
+            "subtitle_content",
+            "subtitle_inline",
+        ],
+    ):
+        if isinstance(container, str):
+            value = _t(container)
+            if value:
+                lines.append(value)
+            continue
         if isinstance(container, list):
             for entry in container:
                 if isinstance(entry, dict):
-                    for key in ["text", "content", "sentence", "line"]:
+                    for key in ["text", "content", "sentence", "line", "subtitle_text", "caption_text"]:
                         value = _t(entry.get(key))
                         if value:
                             lines.append(value)
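A small sketch of _normalize_text_list's behavior (import path assumed, as before):

    from author_home.adapters.platform_adapters import _normalize_text_list

    tags = _normalize_text_list([
        "#cooking",               # leading "#" is stripped
        {"name": "cooking"},      # dict entries fall back to the known name keys
        {"tag_name": "recipes"},  # first key that yields text wins
        42,                       # unsupported entries are skipped
    ])
    assert tags == ["cooking", "recipes"]  # order kept, duplicates dropped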
@@ -118,7 +199,21 @@ def _extract_xhs_subtitle_inline(item: Dict[str, Any]) -> str:
 
 
 def _extract_xhs_subtitle_urls(item: Dict[str, Any]) -> List[str]:
-    return _pick_http_urls(
+    return _pick_http_urls(
+        item,
+        [
+            "subtitle_url",
+            "subtitleUrl",
+            "srt_url",
+            "srtUrl",
+            "vtt_url",
+            "vttUrl",
+            "caption_url",
+            "captionUrl",
+            "subtitle_urls",
+            "subtitleUrls",
+        ],
+    )
 
 
 def _extract_xhs_work_modality(item: Dict[str, Any], *, video_download_url: str, subtitle_inline: str) -> str:
@@ -132,6 +227,66 @@ def _extract_xhs_work_modality(item: Dict[str, Any], *, video_download_url: str,
     return "text"
 
 
+def _extract_xhs_avatar_url(payload: Any) -> str:
+    return (
+        _extract_first_url(_first(payload, ["image"], ""))
+        or _extract_first_url(_first(payload, ["avatar"], ""))
+        or _extract_first_url(_first(payload, ["avatar_url"], ""))
+        or _extract_first_url(_first(payload, ["images"], ""))
+        or _extract_first_url(_first(payload, ["avatar_info"], ""))
+    )
+
+
+def _extract_xhs_cover_image(item: Dict[str, Any]) -> str:
+    return (
+        _extract_first_url(_first(item, ["cover"], ""))
+        or _extract_first_url(_first(item, ["cover_url"], ""))
+        or _extract_first_url(_first(item, ["cover_image"], ""))
+        or _extract_first_url(_first(item, ["image"], ""))
+        or _extract_first_url(_first(item, ["image_url"], ""))
+        or _extract_first_url(_first(item, ["images"], ""))
+    )
+
+
+def _extract_xhs_share_url(item: Dict[str, Any], note_id: str) -> str:
+    return (
+        _t(_first(item, ["share_url", "share_link", "url", "note_url", "short_url"]))
+        or (f"https://www.xiaohongshu.com/explore/{note_id}" if note_id else "")
+    )
+
+
+def _extract_xhs_source_url(item: Dict[str, Any], note_id: str) -> str:
+    return (
+        _t(_first(item, ["source_url", "note_url", "url", "share_url", "share_link"]))
+        or (f"https://www.xiaohongshu.com/explore/{note_id}" if note_id else "")
+    )
+
+
+def _extract_xhs_title(item: Dict[str, Any]) -> str:
+    return _t(_first(item, ["title", "display_title", "note_title", "name"]))
+
+
+def _extract_xhs_caption(item: Dict[str, Any]) -> str:
+    return _t(_first(item, ["desc", "content", "note_desc", "description", "text"]))
+
+
+def _extract_xhs_tags(item: Dict[str, Any]) -> List[str]:
+    for key in ("tag_list", "tags", "hashtags", "topics"):
+        value = _first(item, [key], [])
+        tags = _normalize_text_list(value)
+        if tags:
+            return tags
+    return []
+
+
+def _extract_xhs_profile_payload(raw: Dict[str, Any]) -> Any:
+    profile_response = raw.get("profile_response") if isinstance(raw.get("profile_response"), dict) else {}
+    profile_data = profile_response.get("data")
+    if isinstance(profile_data, dict):
+        return profile_data
+    return profile_response
+
+
 def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dict[str, Any]], List[Dict[str, str]]]:
     missing: List[Dict[str, str]] = []
     profile_data = raw.get("profile_response", {}).get("data")
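For illustration, how the new _extract_xhs_profile_payload unwrapping behaves (import path assumed; the payloads are hypothetical):

    from author_home.adapters.platform_adapters import _extract_xhs_profile_payload

    # A wrapped response yields the inner "data" dict...
    assert _extract_xhs_profile_payload({"profile_response": {"data": {"user_id": "u1"}}}) == {"user_id": "u1"}
    # ...while responses without a usable "data" fall back to profile_response itself.
    assert _extract_xhs_profile_payload({"profile_response": {"code": 0}}) == {"code": 0}
    assert _extract_xhs_profile_payload({}) == {}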
@@ -151,7 +306,11 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
         liked_count=_i(_first(profile_data, ["total_favorited", "liked_count", "favoriting_count"])),
         collected_count=_i(_first(profile_data, ["collect_count", "collected_count", "total_collected_count"])),
         signature=_t(_first(profile_data, ["signature", "desc"])),
-        avatar_url=
+        avatar_url=(
+            _extract_first_url(_first(profile_data, ["avatar_larger"], ""))
+            or _extract_first_url(_first(profile_data, ["avatar_thumb"], ""))
+            or _extract_first_url(_first(profile_data, ["avatar_url", "avatar"], ""))
+        ),
         works_count=_i(_first(profile_data, ["aweme_count", "works_count", "video_count"])),
         verified=bool(_first(profile_data, ["verification_type", "verified"], 0) not in (0, None, "", "false", False)),
         snapshot_at=datetime.now().isoformat(timespec="seconds"),
@@ -177,6 +336,7 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
             "play": _i(_first(item, ["play_count", "view_count"], 0)),
         }
         video_down_url = _extract_douyin_video_down_url(item)
+        tags = _normalize_douyin_tags(_first(item, ["hashtags", "tags", "text_extra"], []))
         work = build_work_item(
             platform="douyin",
             platform_work_id=aweme_id,
@@ -191,9 +351,13 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
             work_modality="video",
             content_type="video",
             duration_ms=_i(_first(item, ["duration_ms", "duration"], 0)),
-            tags=
+            tags=tags,
             metrics=metrics,
-            cover_image=
+            cover_image=(
+                _extract_first_url(_first(item, ["cover_url"], ""))
+                or _extract_first_url(_first(item, ["cover"], ""))
+                or _extract_first_url(_first(item, ["origin_cover"], ""))
+            ),
             source_url=f"https://www.douyin.com/video/{aweme_id}" if aweme_id else "",
             share_url=_t(_first(item, ["share_url", "share_link"])),
             video_download_url=video_down_url,
@@ -206,6 +370,15 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
             },
             raw_ref={"aweme_id": aweme_id, "raw_item": item},
         )
+        work.update(
+            {
+                "digg_count": metrics["like"],
+                "comment_count": metrics["comment"],
+                "collect_count": metrics["collect"],
+                "share_count": metrics["share"],
+                "play_count": metrics["play"],
+            }
+        )
 
         missing.extend(validate_work_item(work))
         works.append(work)
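Both adapters now mirror the nested metrics dict into flat *_count fields, presumably so downstream consumers (such as the work-card contract, which also changed in this release) can read counts without reaching into metrics. A minimal sketch of the effect, using a hypothetical work item:

    metrics = {"like": 10, "comment": 2, "collect": 3, "share": 1, "play": 500}
    work = {"platform_work_id": "w1"}  # hypothetical build_work_item output
    work.update(
        {
            "digg_count": metrics["like"],
            "comment_count": metrics["comment"],
            "collect_count": metrics["collect"],
            "share_count": metrics["share"],
            "play_count": metrics["play"],
        }
    )
    assert work["digg_count"] == 10 and work["play_count"] == 500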
@@ -216,7 +389,7 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
 
 def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dict[str, Any]], List[Dict[str, str]]]:
     missing: List[Dict[str, str]] = []
-    profile_data = raw
+    profile_data = _extract_xhs_profile_payload(raw)
 
     author_id = _t(_first(profile_data, ["user_id", "userid", "id"], raw.get("resolved_author_id")))
     author_handle = _t(_first(profile_data, ["red_id", "redid", "display_id", "username"]))
@@ -230,7 +403,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
         liked_count=_i(_first(profile_data, ["liked_count", "likes", "total_liked", "like_count"])),
         collected_count=_i(_first(profile_data, ["collected_count", "collect_count", "total_collected", "favorite_count"])),
         signature=_t(_first(profile_data, ["desc", "signature", "bio", "introduction"])),
-        avatar_url=
+        avatar_url=_extract_xhs_avatar_url(profile_data),
         works_count=_i(_first(profile_data, ["notes", "note_count", "works_count", "post_count"])),
         verified=bool(_first(profile_data, ["official_verified", "verified"], False)),
         snapshot_at=datetime.now().isoformat(timespec="seconds"),
@@ -245,13 +418,12 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
         if not isinstance(item, dict):
             continue
         note_id = _t(_first(item, ["note_id", "id", "item_id"]))
-        interact = _first(item, ["interact_info", "interaction_info", "statistics"], {})
         metrics = {
-            "like": _i(_first(
-            "comment": _i(_first(
-            "collect": _i(_first(
-            "share": _i(_first(
-            "play": _i(_first(
+            "like": _i(_first(item, ["liked_count", "like_count", "digg_count"], 0)),
+            "comment": _i(_first(item, ["comment_count"], 0)),
+            "collect": _i(_first(item, ["collected_count", "collect_count"], 0)),
+            "share": _i(_first(item, ["share_count"], 0)),
+            "play": _i(_first(item, ["view_count", "play_count"], 0)),
         }
         subtitle_inline = _extract_xhs_subtitle_inline(item)
         subtitle_urls = _extract_xhs_subtitle_urls(item)
@@ -259,6 +431,9 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
         content_type_raw = _t(_first(item, ["type", "note_type", "model_type"]))
         work_modality = _extract_xhs_work_modality(item, video_download_url=video_down_url, subtitle_inline=subtitle_inline)
         content_type = "video" if work_modality == "video" else (content_type_raw or "text")
+        cover_image = _extract_xhs_cover_image(item)
+        source_url = _extract_xhs_source_url(item, note_id)
+        share_url = _extract_xhs_share_url(item, note_id)
 
         work = build_work_item(
             platform="xiaohongshu",
@@ -266,19 +441,19 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
             platform_author_id=author_id,
             author_handle=author_handle,
             author_platform_id=author_id,
-            title=
-            caption_raw=
+            title=_extract_xhs_title(item),
+            caption_raw=_extract_xhs_caption(item),
             subtitle_raw=subtitle_inline,
             subtitle_source="native_subtitle" if subtitle_inline else "missing",
-            publish_time=_t(_first(item, ["publish_time", "time", "create_time"])),
+            publish_time=_t(_first(item, ["publish_time", "time", "create_time", "publishTime", "created_at"])),
             work_modality=work_modality,
             content_type=content_type,
             duration_ms=_i(_first(item, ["duration_ms", "duration", "video_duration"], 0)),
-            tags=
+            tags=_extract_xhs_tags(item),
             metrics=metrics,
-            cover_image=
-            source_url=
-            share_url=
+            cover_image=cover_image,
+            source_url=source_url,
+            share_url=share_url,
             video_download_url=video_down_url,
             asr_status="subtitle_ready" if subtitle_inline else "pending",
             asr_error_reason="",
@@ -291,6 +466,15 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
                 "subtitle_urls": subtitle_urls,
             },
         )
+        work.update(
+            {
+                "digg_count": metrics["like"],
+                "comment_count": metrics["comment"],
+                "collect_count": metrics["collect"],
+                "share_count": metrics["share"],
+                "play_count": metrics["play"],
+            }
+        )
 
         missing.extend(validate_work_item(work))
         works.append(work)
package/skills/creator-analysis/scripts/author_home/analyzers/author_analysis_v2_support.py

@@ -13,9 +13,10 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import jsonschema
 
-
-
-
+SKILL_ROOT = Path(__file__).resolve().parents[3]
+INPUT_SCHEMA_PATH = SKILL_ROOT / "references" / "schemas" / "author-analysis-input-v1.schema.json"
+OUTPUT_SCHEMA_PATH = SKILL_ROOT / "references" / "schemas" / "author-analysis-v2.schema.json"
+PROMPT_CONTRACT_PATH = SKILL_ROOT / "references" / "prompt-contracts" / "author-analysis-v2.md"
 
 LOW_HIGH_MID = {"low", "mid", "high"}
 RELATIONSHIP_DISTANCE = {"near", "mid", "far"}
@@ -49,6 +50,17 @@ STOPWORDS = {
 }
 SCHEMA_CACHE: Dict[Path, Dict[str, Any]] = {}
 
+
+class AnalysisResourceError(RuntimeError):
+    def __init__(self, *, code: str, path: Path, detail: str = "") -> None:
+        self.code = code
+        self.path = path
+        self.detail = detail
+        message = f"{code}:{path}"
+        if detail:
+            message = f"{message}:{detail}"
+        super().__init__(message)
+
 REQUIRED_V2_FIELDS = {
     "author_positioning": ["one_liner", "author_type", "primary_role", "target_audience", "core_problem_solved", "core_value_proposition", "evidence"],
     "trust_model": ["primary_trust_source", "secondary_trust_sources", "trust_building_mechanisms", "trust_risks", "relationship_posture", "evidence"],
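With this release the support module raises AnalysisResourceError instead of silently falling back when a schema or prompt contract cannot be loaded (see the hunks below). A hedged sketch of how a caller might surface the failure; the import path and orchestration here are assumptions:

    from pathlib import Path

    from author_home.analyzers.author_analysis_v2_support import (
        AnalysisResourceError,
        load_json_schema,
    )

    try:
        schema = load_json_schema(Path("references/schemas/author-analysis-v2.schema.json"))
    except AnalysisResourceError as error:
        # code/path/detail are carried on the exception; str(error) is "code:path[:detail]".
        print(f"analysis resource unavailable: {error.code} at {error.path}")
        raise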
@@ -105,15 +117,15 @@ def _clamp(value: float, low: float, high: float) -> float:
 def load_json_schema(path: Path) -> Dict[str, Any]:
     try:
         return json.loads(path.read_text(encoding="utf-8"))
-    except Exception:
-
+    except Exception as error:
+        raise AnalysisResourceError(code="schema_load_failed", path=path, detail=f"{type(error).__name__}:{error}") from error
 
 
 def prompt_contract_text() -> str:
     try:
         return PROMPT_CONTRACT_PATH.read_text(encoding="utf-8").strip()
-    except Exception:
-
+    except Exception as error:
+        raise AnalysisResourceError(code="contract_load_failed", path=PROMPT_CONTRACT_PATH, detail=f"{type(error).__name__}:{error}") from error
 
 
 def _load_schema(path: Path) -> Dict[str, Any]:
@@ -128,7 +140,7 @@ def _load_schema(path: Path) -> Dict[str, Any]:
 def _schema_errors(payload: Any, path: Path) -> List[Dict[str, str]]:
     schema = _load_schema(path)
     if not schema:
-
+        raise AnalysisResourceError(code="schema_empty", path=path)
     try:
         validator = jsonschema.Draft202012Validator(schema)
         rows: List[Dict[str, str]] = []
@@ -152,6 +164,24 @@ def _dedupe_keep_order(values: Sequence[str]) -> List[str]:
     return result
 
 
+def _safe_text_list(value: Any) -> List[str]:
+    if not isinstance(value, list):
+        return []
+    result: List[str] = []
+    for item in value:
+        if isinstance(item, dict):
+            for key in ("name", "value", "label", "hashtag_name", "search_text", "tag_name", "text"):
+                text = _safe_text(item.get(key))
+                if text:
+                    result.append(text)
+                    break
+            continue
+        text = _safe_text(item)
+        if text:
+            result.append(text)
+    return _dedupe_keep_order(result)
+
+
 def _dedupe_error_list(errors: Sequence[Dict[str, str]]) -> List[Dict[str, str]]:
     result: List[Dict[str, str]] = []
     seen = set()
@@ -396,7 +426,7 @@ def _normalize_work(profile: Dict[str, Any], work: Dict[str, Any]) -> Dict[str,
         "share_count": share,
         "play_count": play,
         "content_form": _pick_content_form(work),
-        "tags":
+        "tags": _safe_text_list(work.get("tags")),
         "author_id": _safe_text(profile.get("author_platform_id") or profile.get("platform_author_id")),
         "author_name": _safe_text(profile.get("nickname")) or "作者",
         "performance_score": performance_score,
@@ -636,12 +666,13 @@ def _compare_bucket_groups(items: List[Dict[str, Any]]) -> Dict[str, Any]:
     return result
 
 
-def
+def prepare_author_analysis_bundle(*, profile: Dict[str, Any], works: List[Dict[str, Any]], platform: str) -> Dict[str, Any]:
     normalized = [_normalize_work(profile, work) for work in works if isinstance(work, dict)]
     eligible = [item for item in normalized if _safe_text(item.get("analysis_eligibility")) == "eligible"]
     excluded_count = len(normalized) - len(eligible)
     ranked = _assign_buckets(eligible)
     sampled = _sample_standard_works(ranked)
+    sampled_work_ids = [_safe_text(item.get("platform_work_id")) for item in sampled if _safe_text(item.get("platform_work_id"))]
     aggregate_stats = {
         "total_works": len(ranked),
         "excluded_works_count": excluded_count,
@@ -663,7 +694,7 @@ def build_author_analysis_input_v1(*, profile: Dict[str, Any], works: List[Dict[
         "global_bucket_distribution": _distribution_from_values([_safe_text(item.get("bucket")) for item in ranked], limit=4),
         "global_top_vs_mid_vs_bottom_deltas": _compare_bucket_groups(ranked),
     }
-
+    analysis_input = {
         "author_profile": {
             "platform": _safe_text(profile.get("platform")) or platform,
             "platform_author_id": _safe_text(profile.get("author_platform_id") or profile.get("platform_author_id")),
@@ -696,6 +727,18 @@ def build_author_analysis_input_v1(*, profile: Dict[str, Any], works: List[Dict[
             "sampled_works_count": len(sampled),
         },
     }
+    return {
+        "analysis_input": analysis_input,
+        "normalized_works": normalized,
+        "ranked_works": ranked,
+        "sampled_works": sampled,
+        "sampled_work_ids": sampled_work_ids,
+        "excluded_works_count": excluded_count,
+    }
+
+
+def build_author_analysis_input_v1(*, profile: Dict[str, Any], works: List[Dict[str, Any]], platform: str) -> Tuple[Dict[str, Any], List[Dict[str, str]]]:
+    payload = prepare_author_analysis_bundle(profile=profile, works=works, platform=platform).get("analysis_input") or {}
     return payload, validate_author_analysis_input_v1(payload)
 
 
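The refactor splits the old monolithic builder in two: prepare_author_analysis_bundle runs normalization, bucketing, and sampling once and returns every intermediate artifact, while build_author_analysis_input_v1 remains a thin validated wrapper over it. A usage sketch (the import path and inputs below are assumptions):

    from author_home.analyzers.author_analysis_v2_support import (
        build_author_analysis_input_v1,
        prepare_author_analysis_bundle,
    )

    profile = {"platform": "douyin", "nickname": "demo"}  # hypothetical inputs
    works: list = []

    # One pass yields both the prompt payload and the sampling artifacts.
    bundle = prepare_author_analysis_bundle(profile=profile, works=works, platform="douyin")
    payload = bundle["analysis_input"]
    sampled_ids = bundle["sampled_work_ids"]

    # Callers that only need the validated payload keep the old entry point.
    payload2, schema_errors = build_author_analysis_input_v1(profile=profile, works=works, platform="douyin")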