reyfetch 1.0.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
reyfetch/rdouban.py ADDED
@@ -0,0 +1,565 @@
1
+ # !/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ @Time : 2025-08-25 15:37:50
6
+ @Author : Rey
7
+ @Contact : reyxbo@163.com
8
+ @Explain : Douban Web fetch methods.
9
+ """
10
+
11
+
12
+ from typing import TypedDict
13
+ from bs4 import BeautifulSoup
14
+ from reydb import rorm
15
+ from reydb.rdb import Database
16
+ from reykit.rbase import throw
17
+ from reykit.rnet import request
18
+ from reykit.rre import search, findall, sub
19
+
20
+ from .rbase import FetchCrawl
21
+
22
+
23
# Public names exported by a star-import of this module.
__all__ = ('DatabaseTableDoubanMedia', 'FetchCrawlDouban')
27
+
28
+
29
+ MediaRow = TypedDict(
30
+ 'MediaRow',
31
+ {
32
+ 'id': int,
33
+ 'type': str,
34
+ 'name': str,
35
+ 'score': float,
36
+ 'score_count': int,
37
+ 'image': str,
38
+ 'image_low': str,
39
+ 'episode': int | None,
40
+ 'episode_now': int | None,
41
+ 'year': int,
42
+ 'country': list[str],
43
+ 'class': list[str],
44
+ 'director': list[str] | None,
45
+ 'star': list[str] | None
46
+ }
47
+ )
48
+ type MediaTable = list[MediaRow]
49
+ MediaInfo = TypedDict(
50
+ 'MediaInfo',
51
+ {
52
+ 'type': str,
53
+ 'name': str,
54
+ 'year': int | None,
55
+ 'score': float,
56
+ 'score_count': int,
57
+ 'director': list[str] | None,
58
+ 'scriptwriter': list[str] | None,
59
+ 'star': list[str] | None,
60
+ 'class': list[str] | None,
61
+ 'country': list[str] | None,
62
+ 'language': list[str] | None,
63
+ 'premiere': dict[str, str] | None,
64
+ 'episode': int | None,
65
+ 'minute': int | None,
66
+ 'alias': list[str] | None,
67
+ 'imdb': str | None,
68
+ 'comment': list[str],
69
+ 'image': str,
70
+ 'image_low': str
71
+ }
72
+ )
73
+
74
+
75
class DatabaseTableDoubanMedia(rorm.Model, table=True):
    """
    Database `douban_media` table model.
    One record per Douban media, keyed by the Douban media ID.
    """

    __name__ = 'douban_media'
    __comment__ = 'Douban media information table.'

    # Record time.
    create_time: rorm.Datetime = rorm.Field(field_default=':create_time', not_null=True, index_n=True, comment='Record create time.')
    update_time: rorm.Datetime = rorm.Field(field_default=':update_time', index_n=True, comment='Record update time.')

    # Identity.
    id: int = rorm.Field(rorm.types_mysql.INTEGER(unsigned=True), key=True, comment='Douban media ID.')
    imdb: str = rorm.Field(rorm.types.CHAR(10), index_u=True, comment='IMDb ID.')

    # Base information.
    type: str = rorm.Field(rorm.types.VARCHAR(5), not_null=True, comment='Media type.')
    name: str = rorm.Field(rorm.types.VARCHAR(50), not_null=True, index_n=True, comment='Media name.')
    year: str = rorm.Field(rorm.types_mysql.YEAR, not_null=True, comment='Release year.')
    desc: str = rorm.Field(rorm.types.VARCHAR(1000), comment='Media content description.')
    score: float = rorm.Field(rorm.types.FLOAT, comment='Media score, [0,10].')
    score_count: int = rorm.Field(rorm.types_mysql.INTEGER(unsigned=True), comment='Media score count.')
    minute: int = rorm.Field(rorm.types_mysql.SMALLINT(unsigned=True), comment='Movie or TV drama episode minute.')
    episode: int = rorm.Field(rorm.types_mysql.SMALLINT(unsigned=True), comment='TV drama total episode number.')
    episode_now: int = rorm.Field(rorm.types_mysql.SMALLINT(unsigned=True), comment='TV drama current episode number.')

    # JSON information.
    premiere: str = rorm.Field(rorm.types.JSON, comment='Premiere region and date dictionary.')
    country: str = rorm.Field(rorm.types.JSON, comment='Release country list.')
    # The attribute is `class_` because `class` is a Python keyword.
    # NOTE(review): `filed_name` looks like a typo of `field_name` - confirm against the `rorm.Field` signature.
    class_: str = rorm.Field(rorm.types.JSON, comment='Class list.', arg_name='class', filed_name='class')
    director: str = rorm.Field(rorm.types.JSON, comment='Director list.')
    scriptwriter: str = rorm.Field(rorm.types.JSON, comment='Scriptwriter list.')
    language: str = rorm.Field(rorm.types.JSON, comment='Language list.')
    alias: str = rorm.Field(rorm.types.JSON, comment='Alias list.')
    comment: str = rorm.Field(rorm.types.JSON, comment='Comment list.')

    # URL.
    image: str = rorm.Field(rorm.types.VARCHAR(150), not_null=True, comment='Picture image URL.')
    image_low: str = rorm.Field(rorm.types.VARCHAR(150), not_null=True, comment='Picture image low resolution URL.')
    video: str = rorm.Field(rorm.types.VARCHAR(150), comment='Preview video Douban page URL.')
106
+
107
+
108
class FetchCrawlDouban(FetchCrawl):
    """
    Crawl Douban Web fetch type.
    Can create database used `self.build_db` method.

    Attributes
    ----------
    db_names : Database table name mapping dictionary.
    """

    db_names = {
        'douban_media': 'douban_media',
        'stats_douban': 'stats_douban'
    }


    def __init__(self, db: Database | None = None) -> None:
        """
        Build instance attributes.

        Parameters
        ----------
        db : `Database` instance.
            - `None`: Not use database.
            - `Database`: Automatic record to database.
        """

        # Build.
        # NOTE(review): crawl methods read `self.ua`, which is not set here - presumably provided by `FetchCrawl`, confirm.
        self.db = db


    def crawl_table(self) -> MediaTable:
        """
        Crawl media table.
        Request the recent hot list of every preset category, and keep the first row of each media ID.
        When `self.db` is set, insert the table into the media database table.

        Returns
        -------
        Media table.
        """

        # Parameter.
        url_format = 'https://m.douban.com/rexxar/api/v2/subject/recent_hot/%s'
        referer_format = 'https://movie.douban.com/%s/'

        ## Each item is (media type, referer page, category, category type).
        types_params = (
            ('movie', 'explore', '热门', '华语'),
            ('movie', 'explore', '热门', '欧美'),
            ('movie', 'explore', '热门', '日本'),
            ('movie', 'explore', '热门', '韩国'),
            ('tv', 'tv', 'tv', 'tv_domestic'),
            ('tv', 'tv', 'tv', 'tv_american'),
            ('tv', 'tv', 'tv', 'tv_japanese'),
            ('tv', 'tv', 'tv', 'tv_korean'),
            ('tv', 'tv', 'tv', 'tv_animation'),
            ('tv', 'tv', 'tv', 'tv_documentary'),
            ('tv', 'tv', 'show', 'show_domestic'),
            ('tv', 'tv', 'show', 'show_foreign')
        )

        # Get.
        table_dict: dict[int, MediaRow] = {}
        for type_, page, category, category_type in types_params:
            params = {
                'start': 0,
                'limit': 1000,
                'category': category,
                'type': category_type,
                'ck': 'Id-j'
            }
            headers = {
                'referer': referer_format % page,
                'user-agent': self.ua.edge
            }

            ## Request.
            response = request(
                url_format % type_,
                params,
                headers=headers,
                check=True
            )

            ## Extract.
            items: list[dict] = response.json()['items']
            for item in items:
                id_ = int(item['id'])

                ### Exist.
                if id_ in table_dict:
                    continue

                ### Episode.
                episodes_info: str = item['episodes_info']
                episode_now = None
                episode = None
                if episodes_info != '':
                    digits = search(r'\d+', episodes_info)
                    if digits is not None:
                        episode_now = int(digits)

                    #### Text containing '全' means the current number is the total number.
                    if '全' in episodes_info:
                        episode = episode_now

                ### Information.
                #### Subtitle has 3 to 5 parts, 4 parts has no directors, 3 parts has no directors and stars.
                year, countries, classes, *persons = item['card_subtitle'].split(' / ', 4)
                if len(persons) == 2:
                    directors, stars = persons
                elif len(persons) == 1:
                    directors = None
                    stars, = persons
                else:
                    directors = stars = None

                ### Add.
                table_dict[id_] = {
                    'id': id_,
                    'type': type_,
                    'name': item['title'],

                    #### Zero value means no score, record as null.
                    'score': float(item['rating']['value']) or None,
                    'score_count': int(item['rating']['count']) or None,
                    'image': item['pic']['large'],
                    'image_low': item['pic']['normal'],
                    'episode_now': episode_now,
                    'episode': episode,
                    'year': int(year),
                    'country': countries.split(),
                    'class': classes.split(),
                    'director': directors and directors.split(),
                    'star': stars and stars.split()
                }

        ## Convert.
        table = list(table_dict.values())

        # Database.
        if self.db is not None:
            update_fields = (
                'id',
                'type',
                'name',
                'score',
                'score_count',
                'image',
                'image_low',
                'episode',
                'episode_now',
                'year'
            )
            self.db.execute.insert(
                self.db_names['douban_media'],
                table,
                update_fields
            )

        return table


    def crawl_info(self, id_: int) -> MediaInfo:
        """
        Crawl media information.
        Request the Douban subject page of the media and extract information from the HTML.
        When `self.db` is set, insert the information into the media database table.

        Parameters
        ----------
        id\\_ : Douban media ID.

        Returns
        -------
        Media information.
        """

        # Parameter.
        url = f'https://movie.douban.com/subject/{id_}/'
        headers = {'user-agent': self.ua.edge}

        # Request.
        response = request(
            url,
            headers=headers,
            check=True
        )

        # Extract.
        html = response.text
        bs = BeautifulSoup(html, 'lxml')
        info_element = bs.find(attrs={'id': 'info'})

        ## Each information line is 'label: value'.
        result = findall(r'([^\n]+?): ([^\n]+)\n', info_element.text)
        info_dict: dict[str, str] = dict(result)
        split_chars = ' / '
        infos = {}


        def get_list(label: str) -> list[str] | None:
            """
            Get information value list by label, not exist is `None`.
            """

            value = info_dict.get(label)
            if not value:
                return None

            return value.split(split_chars)


        ## Type.
        article = bs.find(attrs='article')
        article_text = '' if article is None else article.text
        if (
            'class="episode_list"' in html
            or '该剧目前还未确定具体集数,如果你知道,欢迎' in article_text
        ):
            infos['type'] = 'tv'
        else:
            infos['type'] = 'movie'

        ## Name.
        infos['name'] = search(r'<title>\s*(.+?)\s*\(豆瓣\)\s*</title>', html)

        ## Year.
        year: str | None = search(r'<span class="year">\((\d{4})\)</span>', html)
        infos['year'] = year and int(year)

        ## Description.
        elements = bs.select('#link-report-intra span[property="v:summary"]', limit=1)
        if elements:
            text = elements[0].text.strip()
            infos['desc'] = sub(r'\s{2,}', text, '')
        else:
            infos['desc'] = None

        ## Score.
        score_text = bs.find(attrs='ll rating_num').text.strip()
        infos['score'] = float(score_text) if score_text else None

        ## Score count.
        if infos['score'] is None:
            infos['score_count'] = None
        else:
            votes_element = bs.find(attrs={'property': 'v:votes'})
            infos['score_count'] = int(votes_element.text)

        ## Directors.
        infos['director'] = get_list('导演')

        ## Scriptwriters.
        infos['scriptwriter'] = get_list('编剧')

        ## Stars.
        infos['star'] = get_list('主演')

        ## Classes.
        infos['class'] = get_list('类型')

        ## Countries.
        infos['country'] = get_list('制片国家/地区')

        ## Languages.
        infos['language'] = get_list('语言')

        ## Premieres.
        premieres = get_list('上映日期') or get_list('首播')
        if premieres is None:
            infos['premiere'] = None
        else:
            premiere_dict: dict[str, str] = {}
            for premiere in premieres:
                result = search(r'([^\(]+)\((.+)\)', premiere)

                ### Skip item not in 'date(region)' format, it has no region key.
                if result is None:
                    continue

                date, region = result
                premiere_dict[region] = date
            infos['premiere'] = premiere_dict or None

        ## Episode.
        episode = info_dict.get('集数')
        infos['episode'] = episode and int(episode)

        ## Minute.
        minute = info_dict.get('片长') or info_dict.get('单集片长')
        minute_digits = minute and search(r'\d+', minute)
        infos['minute'] = int(minute_digits) if minute_digits else None

        ## Alias.
        infos['alias'] = get_list('又名')

        ## IMDb.
        infos['imdb'] = info_dict.get('IMDb')

        ## Comments.
        comments = []
        for element in bs.select('#hot-comments .comment-content'):

            ### Prefer full text, fall back to short text.
            content = element.find(attrs='full') or element.find(attrs='short')
            if content is None:
                continue

            comments.append(sub(r'\s{2,}', content.text.strip(), ''))
        infos['comment'] = comments

        ## Image.
        image_element, = bs.select(selector='.nbgnbg>img', limit=1)
        image_url: str = image_element.attrs['src']
        infos['image_low'] = image_url.replace('.webp', '.jpg', 1)
        infos['image'] = infos['image_low'].replace('/s_ratio_poster/', '/m_ratio_poster/', 1)

        ## Video.
        video_element = bs.find(attrs='related-pic-video')
        if video_element is None:
            infos['video'] = None
        else:
            video_url: str = video_element.attrs['href']
            infos['video'] = video_url.replace('#content', '', 1)

        # Database.
        if self.db is not None:
            data = {'id': id_, **infos}
            self.db.execute.insert(
                self.db_names['douban_media'],
                data,
                'update'
            )

        return infos


    def crawl_video_url(self, url: str) -> str:
        """
        Crawl video download URL from video page URL.

        Parameters
        ----------
        url : Video page URL.

        Returns
        -------
        Video download URL.
        """

        # Request.
        headers = {'user-agent': self.ua.edge}
        response = request(url, headers=headers, check=True)

        # Extract.
        ## First `<source>` tag `src` attribute of the page.
        pattern = r'<source src="([^"]+)"'
        result: str | None = search(pattern, response.text)

        # Check.
        if result is None:
            throw(AssertionError, result, url)

        return result


    def build_db(self) -> None:
        """
        Check and build database tables, by `self.db_names`.
        Build the media table and the stats view, `self.db` must be set.
        """

        # Check.
        if self.db is None:
            throw(ValueError, self.db)

        # Parameter.
        table_name = self.db_names['douban_media']
        table_path = f'`{self.db.database}`.`{table_name}`'

        ## Table.
        DatabaseTableDoubanMedia._set_name(table_name)
        tables = [DatabaseTableDoubanMedia]

        ## View stats.
        ### Each item is (name, select expression, where condition, comment).
        stats_params = (
            ('count', 'COUNT(1)', None, 'Media count.'),
            (
                'past_day_count',
                'COUNT(1)',
                'TIMESTAMPDIFF(DAY, `create_time`, NOW()) = 0',
                'Media count in the past day.'
            ),
            (
                'past_week_count',
                'COUNT(1)',
                'TIMESTAMPDIFF(DAY, `create_time`, NOW()) <= 6',
                'Media count in the past week.'
            ),
            (
                'past_month_count',
                'COUNT(1)',
                'TIMESTAMPDIFF(DAY, `create_time`, NOW()) <= 29',
                'Media count in the past month.'
            ),
            ('avg_score', 'ROUND(AVG(`score`), 1)', None, 'Media average score.'),
            ('score_count', 'FORMAT(SUM(`score_count`), 0)', None, 'Media score count.'),
            ('last_create_time', 'MAX(`create_time`)', None, 'Media last record create time.'),
            (
                'last_update_time',
                'IFNULL(MAX(`update_time`), MAX(`create_time`))',
                None,
                'Media last record update time.'
            )
        )
        stats_items = []
        for name, expression, condition, comment in stats_params:
            select = f'SELECT {expression}\nFROM {table_path}'
            if condition is not None:
                select += f'\nWHERE {condition}'
            stats_items.append(
                {
                    'name': name,
                    'select': select,
                    'comment': comment
                }
            )
        views_stats = [
            {
                'path': self.db_names['stats_douban'],
                'items': stats_items
            }
        ]

        # Build.
        self.db.build.build(tables=tables, views_stats=views_stats, skip=True)