birdapi 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bird/_utils.py ADDED
@@ -0,0 +1,491 @@
1
+ """Tweet/user parsing utilities, ported from twitter-client-utils.js."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any, Optional
7
+
8
+ from ._models import (
9
+ ArticleMetadata,
10
+ Author,
11
+ MediaItem,
12
+ Tweet,
13
+ User,
14
+ )
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Handle normalisation
19
+ # ---------------------------------------------------------------------------
20
+
21
_HANDLE_RE = re.compile(r"^[A-Za-z0-9_]{1,15}$")


def normalize_handle(raw: Optional[str]) -> Optional[str]:
    """Return *raw* as a bare handle, or ``None`` when it is not a valid one.

    Surrounding whitespace and any leading ``@`` characters are removed, then
    the remainder must match ``_HANDLE_RE`` (1-15 ASCII letters, digits or
    underscores).
    """
    if not raw:
        return None
    candidate = raw.strip().lstrip("@").strip()
    # An empty candidate cannot satisfy the {1,15} quantifier, so the regex
    # check alone also rejects input that was nothing but "@" and spaces.
    return candidate if _HANDLE_RE.match(candidate) else None
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # URL extraction helpers
35
+ # ---------------------------------------------------------------------------
36
+
37
# re.ASCII restricts ``\d`` to 0-9; without it the patterns would also accept
# other Unicode decimal digits, which are never part of a numeric ID.
_TWEET_ID_RE = re.compile(r"/status(?:es)?/(\d+)", re.ASCII)
_LIST_ID_RE = re.compile(r"/lists?/(\d+)", re.ASCII)
_BOOKMARK_FOLDER_ID_RE = re.compile(r"/bookmarks/(\d+)", re.ASCII)


def _extract_numeric_id(value: str, pattern: "re.Pattern[str]") -> Optional[str]:
    """Return *value* if it is a bare ASCII-digit ID, else the ID captured by *pattern*.

    ``str.isdigit`` alone is true for characters such as superscript digits,
    so it is combined with ``str.isascii`` to accept plain 0-9 only.
    Returns ``None`` when neither form matches.
    """
    if value.isascii() and value.isdigit():
        return value
    match = pattern.search(value)
    return match.group(1) if match else None


def extract_tweet_id(value: str) -> Optional[str]:
    """Return tweet ID from a URL or bare numeric ID."""
    return _extract_numeric_id(value, _TWEET_ID_RE)


def extract_list_id(value: str) -> Optional[str]:
    """Return list ID from a URL or bare numeric ID."""
    return _extract_numeric_id(value, _LIST_ID_RE)


def extract_bookmark_folder_id(value: str) -> Optional[str]:
    """Return bookmark-folder ID from a URL or bare numeric ID."""
    return _extract_numeric_id(value, _BOOKMARK_FOLDER_ID_RE)
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Draft.js / rich-content rendering
66
+ # ---------------------------------------------------------------------------
67
+
68
+ def _render_block_text(block: dict, entity_map: dict[int, Any]) -> str:
69
+ text: str = block.get("text", "")
70
+ # Apply LINK entities in reverse offset order to preserve positions
71
+ link_ranges = [
72
+ r for r in block.get("entityRanges", [])
73
+ if entity_map.get(r["key"], {}).get("type") == "LINK"
74
+ and entity_map[r["key"]].get("data", {}).get("url")
75
+ ]
76
+ link_ranges.sort(key=lambda r: r["offset"], reverse=True)
77
+ for rng in link_ranges:
78
+ entity = entity_map[rng["key"]]
79
+ url = entity["data"]["url"]
80
+ start, length = rng["offset"], rng["length"]
81
+ link_text = text[start : start + length]
82
+ text = text[:start] + f"[{link_text}]({url})" + text[start + length:]
83
+ return text.strip()
84
+
85
+
86
+ def _render_atomic_block(block: dict, entity_map: dict[int, Any]) -> Optional[str]:
87
+ ranges = block.get("entityRanges", [])
88
+ if not ranges:
89
+ return None
90
+ entity = entity_map.get(ranges[0]["key"])
91
+ if not entity:
92
+ return None
93
+ t = entity.get("type")
94
+ data = entity.get("data", {})
95
+ if t == "MARKDOWN":
96
+ md = data.get("markdown", "")
97
+ return md.strip() if md else None
98
+ if t == "DIVIDER":
99
+ return "---"
100
+ if t == "TWEET" and data.get("tweetId"):
101
+ return f"[Embedded Tweet: https://x.com/i/status/{data['tweetId']}]"
102
+ if t == "LINK" and data.get("url"):
103
+ return f"[Link: {data['url']}]"
104
+ if t == "IMAGE":
105
+ return "[Image]"
106
+ return None
107
+
108
+
109
def render_content_state(content_state: Optional[dict]) -> Optional[str]:
    """Render a Draft.js content_state to Markdown-like plain text.

    Each block becomes one paragraph; paragraphs are joined with a blank line.
    Headers, list items and block quotes get a Markdown prefix, ``atomic``
    blocks are rendered by ``_render_atomic_block``, and every other block
    type is emitted as plain text. Blocks that render to nothing are skipped.

    Returns ``None`` when *content_state* is empty, has no blocks, or nothing
    could be rendered.
    """
    if not content_state or not content_state.get("blocks"):
        return None

    # The entity map is accepted either as a list of {"key", "value"} records
    # or as a mapping; both are normalised to int keys.
    entity_map: dict[int, Any] = {}
    raw_map = content_state.get("entityMap", [])
    if isinstance(raw_map, list):
        for entry in raw_map:
            try:
                entity_map[int(entry["key"])] = entry["value"]
            except (KeyError, TypeError, ValueError):
                # Best effort: skip malformed entries. TypeError covers a
                # null key (int(None)) and entries that are not mappings.
                continue
    elif isinstance(raw_map, dict):
        for key, value in raw_map.items():
            try:
                entity_map[int(key)] = value
            except (TypeError, ValueError):
                continue

    prefixes = {
        "header-one": "# ",
        "header-two": "## ",
        "header-three": "### ",
        "unordered-list-item": "- ",
        "blockquote": "> ",
    }

    lines: list[str] = []
    ordered_counter = 0
    for block in content_state["blocks"]:
        btype = block.get("type", "unstyled")
        if btype == "atomic":
            ordered_counter = 0
            rendered = _render_atomic_block(block, entity_map)
        else:
            if btype == "ordered-list-item":
                # Numbering counts every item of the run, even ones that
                # render to empty text and are therefore not emitted.
                ordered_counter += 1
                prefix = f"{ordered_counter}. "
            else:
                # Any other block ends the current ordered list.
                ordered_counter = 0
                prefix = prefixes.get(btype, "") if isinstance(btype, str) else ""
            text = _render_block_text(block, entity_map)
            rendered = f"{prefix}{text}" if text else None
        if rendered:
            lines.append(rendered)

    result = "\n\n".join(lines).strip()
    return result or None
181
+
182
+
183
+ # ---------------------------------------------------------------------------
184
+ # Tweet text extraction
185
+ # ---------------------------------------------------------------------------
186
+
187
+ def _first_text(*values: Any) -> Optional[str]:
188
+ for v in values:
189
+ if isinstance(v, str):
190
+ s = v.strip()
191
+ if s:
192
+ return s
193
+ return None
194
+
195
+
196
+ def _collect_text_fields(value: Any, keys: set[str], output: list[str]) -> None:
197
+ if not value or isinstance(value, str):
198
+ return
199
+ if isinstance(value, list):
200
+ for item in value:
201
+ _collect_text_fields(item, keys, output)
202
+ return
203
+ if isinstance(value, dict):
204
+ for k, nested in value.items():
205
+ if k in keys and isinstance(nested, str):
206
+ s = nested.strip()
207
+ if s:
208
+ output.append(s)
209
+ continue
210
+ _collect_text_fields(nested, keys, output)
211
+
212
+
213
+ def _unique_ordered(values: list[str]) -> list[str]:
214
+ seen: set[str] = set()
215
+ result: list[str] = []
216
+ for v in values:
217
+ if v not in seen:
218
+ seen.add(v)
219
+ result.append(v)
220
+ return result
221
+
222
+
223
+ def _extract_article_text(result: dict) -> Optional[str]:
224
+ article = result.get("article")
225
+ if not article:
226
+ return None
227
+ article_result = article.get("article_results", {}).get("result") or article
228
+ title = _first_text(article_result.get("title"), article.get("title"))
229
+ content_state = article.get("article_results", {}).get("result", {}).get("content_state")
230
+ rich_body = render_content_state(content_state)
231
+ if rich_body:
232
+ if title:
233
+ nt = title.strip()
234
+ tb = rich_body.lstrip()
235
+ headings = [f"# {nt}", f"## {nt}", f"### {nt}"]
236
+ has_title = (
237
+ tb == nt
238
+ or tb.startswith(nt + "\n")
239
+ or any(tb.startswith(h) for h in headings)
240
+ )
241
+ if not has_title:
242
+ return f"{title}\n\n{rich_body}"
243
+ return rich_body
244
+
245
+ # Fallback: plain text
246
+ body = _first_text(
247
+ article_result.get("plain_text"),
248
+ article.get("plain_text"),
249
+ (article_result.get("body") or {}).get("text"),
250
+ (article_result.get("body") or {}).get("richtext", {}).get("text"),
251
+ (article_result.get("body") or {}).get("rich_text", {}).get("text"),
252
+ (article_result.get("content") or {}).get("text"),
253
+ article_result.get("text"),
254
+ )
255
+ if body and title and body.strip() == title.strip():
256
+ body = None
257
+ if not body:
258
+ collected: list[str] = []
259
+ _collect_text_fields(article_result, {"text", "title"}, collected)
260
+ _collect_text_fields(article, {"text", "title"}, collected)
261
+ unique = _unique_ordered(collected)
262
+ filtered = [v for v in unique if v != title] if title else unique
263
+ if filtered:
264
+ body = "\n\n".join(filtered)
265
+ if title and body and not body.startswith(title):
266
+ return f"{title}\n\n{body}"
267
+ return body or title
268
+
269
+
270
+ def _extract_note_tweet_text(result: dict) -> Optional[str]:
271
+ note = (result.get("note_tweet") or {}).get("note_tweet_results", {}).get("result")
272
+ if not note:
273
+ return None
274
+ return _first_text(
275
+ note.get("text"),
276
+ (note.get("richtext") or {}).get("text"),
277
+ (note.get("rich_text") or {}).get("text"),
278
+ (note.get("content") or {}).get("text"),
279
+ )
280
+
281
+
282
def _extract_tweet_text(result: dict) -> Optional[str]:
    """Return the tweet's text from the richest source available.

    Sources are tried in order: article text, note-tweet text, then
    ``legacy.full_text``. Returns ``None`` when none of them yields text.
    """
    article_text = _extract_article_text(result)
    if article_text:
        return article_text
    note_text = _extract_note_tweet_text(result)
    if note_text:
        return note_text
    legacy = result.get("legacy") or {}
    return _first_text(legacy.get("full_text"))
288
+
289
+
290
+ def _extract_article_metadata(result: dict) -> Optional[ArticleMetadata]:
291
+ article = result.get("article")
292
+ if not article:
293
+ return None
294
+ article_result = article.get("article_results", {}).get("result") or article
295
+ title = _first_text(article_result.get("title"), article.get("title"))
296
+ if not title:
297
+ return None
298
+ preview_text = _first_text(
299
+ article_result.get("preview_text"), article.get("preview_text")
300
+ )
301
+ return ArticleMetadata(title=title, preview_text=preview_text)
302
+
303
+
304
+ def _extract_media(result: dict) -> Optional[list[MediaItem]]:
305
+ legacy = result.get("legacy") or {}
306
+ raw_media = (legacy.get("extended_entities") or {}).get("media") or (
307
+ legacy.get("entities") or {}
308
+ ).get("media")
309
+ if not raw_media:
310
+ return None
311
+ items: list[MediaItem] = []
312
+ for item in raw_media:
313
+ if not item.get("type") or not item.get("media_url_https"):
314
+ continue
315
+ media_item = MediaItem(type=item["type"], url=item["media_url_https"])
316
+ sizes = item.get("sizes") or {}
317
+ for size_key in ("large", "medium"):
318
+ sz = sizes.get(size_key)
319
+ if sz:
320
+ media_item.width = sz.get("w")
321
+ media_item.height = sz.get("h")
322
+ break
323
+ if sizes.get("small"):
324
+ media_item.preview_url = f"{item['media_url_https']}:small"
325
+ if item["type"] in ("video", "animated_gif"):
326
+ video_info = item.get("video_info") or {}
327
+ variants = video_info.get("variants") or []
328
+ mp4 = [v for v in variants if v.get("content_type") == "video/mp4" and v.get("url")]
329
+ mp4_with_bitrate = sorted(
330
+ [v for v in mp4 if isinstance(v.get("bitrate"), int)],
331
+ key=lambda v: v["bitrate"],
332
+ reverse=True,
333
+ )
334
+ selected = (mp4_with_bitrate or mp4 or [None])[0]
335
+ if selected:
336
+ media_item.video_url = selected["url"]
337
+ if isinstance(video_info.get("duration_millis"), int):
338
+ media_item.duration_ms = video_info["duration_millis"]
339
+ items.append(media_item)
340
+ return items or None
341
+
342
+
343
+ def _unwrap_tweet_result(result: Optional[dict]) -> Optional[dict]:
344
+ if not result:
345
+ return None
346
+ return result.get("tweet") or result
347
+
348
+
349
def map_tweet_result(
    result: Optional[dict],
    quote_depth: int = 1,
    include_raw: bool = False,
) -> Optional[Tweet]:
    """Convert one raw tweet result into a ``Tweet``.

    Args:
        result: Raw tweet result object.
        quote_depth: How many levels of quoted tweets to map; ``0`` leaves
            ``quoted_tweet`` unset.
        include_raw: When true, the raw *result* is stored on the returned
            tweet as ``_raw`` (also applied to quoted tweets).

    Returns:
        The mapped ``Tweet``, or ``None`` when *result* is empty, has no
        ``rest_id``, has no author screen name, or has no extractable text.
    """
    if not result:
        return None

    # Every hop uses `or {}`: a .get() default only covers a missing key, so
    # an explicit null for user_results would otherwise raise AttributeError.
    user_results = (result.get("core") or {}).get("user_results") or {}
    user_result = user_results.get("result") or {}
    user_legacy = user_result.get("legacy") or {}
    user_core = user_result.get("core") or {}
    username = user_legacy.get("screen_name") or user_core.get("screen_name")
    name = user_legacy.get("name") or user_core.get("name") or username
    user_id = user_result.get("rest_id")

    if not result.get("rest_id") or not username:
        return None

    text = _extract_tweet_text(result)
    if not text:
        return None

    quoted_tweet: Optional[Tweet] = None
    if quote_depth > 0:
        quoted_result = _unwrap_tweet_result(
            (result.get("quoted_status_result") or {}).get("result")
        )
        if quoted_result:
            quoted_tweet = map_tweet_result(quoted_result, quote_depth - 1, include_raw)

    legacy = result.get("legacy") or {}
    tweet = Tweet(
        id=result["rest_id"],
        text=text,
        author=Author(username=username, name=name or username),
        created_at=legacy.get("created_at"),
        reply_count=legacy.get("reply_count"),
        retweet_count=legacy.get("retweet_count"),
        like_count=legacy.get("favorite_count"),
        conversation_id=legacy.get("conversation_id_str"),
        # Empty string is normalised to None.
        in_reply_to_status_id=legacy.get("in_reply_to_status_id_str") or None,
        author_id=user_id,
        quoted_tweet=quoted_tweet,
        media=_extract_media(result),
        article=_extract_article_metadata(result),
    )
    if include_raw:
        tweet._raw = result
    return tweet
397
+
398
+
399
+ def _collect_tweet_results_from_entry(entry: dict) -> list[dict]:
400
+ results: list[dict] = []
401
+ content = entry.get("content") or {}
402
+
403
+ def push(r: Optional[dict]) -> None:
404
+ if r and r.get("rest_id"):
405
+ results.append(r)
406
+
407
+ push((content.get("itemContent") or {}).get("tweet_results", {}).get("result"))
408
+ push((content.get("item") or {}).get("itemContent", {}).get("tweet_results", {}).get("result"))
409
+ for item in content.get("items") or []:
410
+ push((item.get("item") or {}).get("itemContent", {}).get("tweet_results", {}).get("result"))
411
+ push((item.get("itemContent") or {}).get("tweet_results", {}).get("result"))
412
+ push((item.get("content") or {}).get("itemContent", {}).get("tweet_results", {}).get("result"))
413
+ return results
414
+
415
+
416
def parse_tweets_from_instructions(
    instructions: Optional[list],
    quote_depth: int = 1,
    include_raw: bool = False,
) -> list[Tweet]:
    """Map every tweet found in the instructions' entries, de-duplicated by id.

    Tweets are returned in the order they are first encountered; when an id
    occurs more than once the first mapped tweet is kept. Results that
    ``map_tweet_result`` cannot map are dropped.
    """
    first_by_id: dict[str, Tweet] = {}
    for instruction in instructions or []:
        for entry in instruction.get("entries") or []:
            for raw in _collect_tweet_results_from_entry(entry):
                tweet = map_tweet_result(raw, quote_depth, include_raw)
                if tweet:
                    # setdefault keeps the earliest tweet for a repeated id.
                    first_by_id.setdefault(tweet.id, tweet)
    return list(first_by_id.values())
431
+
432
+
433
def extract_cursor_from_instructions(
    instructions: Optional[list],
    cursor_type: str = "Bottom",
) -> Optional[str]:
    """Return the value of the first cursor entry of the requested type.

    An entry matches when its ``content.cursorType`` equals *cursor_type* and
    its ``content.value`` is non-empty. Returns ``None`` if nothing matches.
    """
    matching_values = (
        content["value"]
        for instruction in instructions or []
        for entry in instruction.get("entries") or []
        for content in (entry.get("content") or {},)
        if content.get("cursorType") == cursor_type and content.get("value")
    )
    return next(matching_values, None)
443
+
444
+
445
def find_tweet_in_instructions(
    instructions: Optional[list],
    tweet_id: str,
) -> Optional[dict]:
    """Return the raw tweet result whose ``rest_id`` equals *tweet_id*.

    Only ``content.itemContent.tweet_results.result`` of each entry is
    inspected. Returns ``None`` when no entry matches.
    """
    for instruction in instructions or []:
        for entry in instruction.get("entries") or []:
            # `or {}` at every hop: explicit nulls are treated as missing.
            item_content = (entry.get("content") or {}).get("itemContent") or {}
            result = (item_content.get("tweet_results") or {}).get("result")
            if result and result.get("rest_id") == tweet_id:
                return result
    return None
457
+
458
+
459
def parse_users_from_instructions(instructions: Optional[list]) -> list[User]:
    """Map every user result found in the instructions' entries to a ``User``.

    Each entry's ``content.itemContent.user_results.result`` is inspected.
    ``UserWithVisibilityResults`` wrappers are unwrapped; results whose
    ``__typename`` is not ``"User"``, or that lack a ``rest_id`` or screen
    name, are skipped. Fields are read from ``legacy`` first, then ``core``.
    """
    users: list[User] = []
    for instruction in instructions or []:
        for entry in instruction.get("entries") or []:
            # `or {}` at every hop: explicit nulls are treated as missing.
            item_content = (entry.get("content") or {}).get("itemContent") or {}
            raw = (item_content.get("user_results") or {}).get("result")
            if not raw:
                continue
            # Unwrap UserWithVisibilityResults
            if raw.get("__typename") == "UserWithVisibilityResults" and raw.get("user"):
                raw = raw["user"]
            if raw.get("__typename") != "User":
                continue
            legacy = raw.get("legacy") or {}
            core = raw.get("core") or {}
            username = legacy.get("screen_name") or core.get("screen_name")
            if not raw.get("rest_id") or not username:
                continue
            users.append(
                User(
                    id=raw["rest_id"],
                    username=username,
                    name=legacy.get("name") or core.get("name") or username,
                    description=legacy.get("description"),
                    followers_count=legacy.get("followers_count"),
                    following_count=legacy.get("friends_count"),
                    is_blue_verified=raw.get("is_blue_verified"),
                    profile_image_url=legacy.get("profile_image_url_https")
                    or (raw.get("avatar") or {}).get("image_url"),
                    created_at=legacy.get("created_at") or core.get("created_at"),
                )
            )
    return users