pyfanedit 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyfanedit/__init__.py +20 -0
- pyfanedit/client.py +294 -0
- pyfanedit/models.py +140 -0
- pyfanedit/parsers.py +690 -0
- pyfanedit/session.py +84 -0
- pyfanedit/version.py +8 -0
- pyfanedit-0.1.1a1.dist-info/METADATA +13 -0
- pyfanedit-0.1.1a1.dist-info/RECORD +10 -0
- pyfanedit-0.1.1a1.dist-info/WHEEL +5 -0
- pyfanedit-0.1.1a1.dist-info/top_level.txt +1 -0
pyfanedit/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from pyfanedit.client import CATEGORIES, FaneditClient
|
|
2
|
+
from pyfanedit.models import (
|
|
3
|
+
FaneditDetail, FaneditSummary,
|
|
4
|
+
NewsArticle, Review, ReviewRatings,
|
|
5
|
+
ReviewerEntry, UserReviewEntry,
|
|
6
|
+
)
|
|
7
|
+
from pyfanedit.version import __version__
|
|
8
|
+
|
|
9
|
+
# Names re-exported at package level; every entry is imported above from
# pyfanedit.client, pyfanedit.models or pyfanedit.version.
__all__ = [
    "__version__",
    "FaneditClient",
    "FaneditSummary",
    "FaneditDetail",
    "ReviewerEntry",
    "UserReviewEntry",
    "NewsArticle",
    "Review",
    "ReviewRatings",
    "CATEGORIES",
]
|
pyfanedit/client.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""High-level fanedit.org client."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Iterator, List, Optional
|
|
5
|
+
|
|
6
|
+
from pyfanedit.models import (
|
|
7
|
+
FaneditDetail, FaneditSummary,
|
|
8
|
+
NewsArticle, ReviewerEntry, UserReviewEntry,
|
|
9
|
+
)
|
|
10
|
+
from pyfanedit.parsers import (
|
|
11
|
+
parse_detail_page, parse_listing_page,
|
|
12
|
+
parse_news_article, parse_news_listing,
|
|
13
|
+
parse_reviewer_rank_page, parse_user_reviews_page,
|
|
14
|
+
)
|
|
15
|
+
from pyfanedit.session import Session
|
|
16
|
+
|
|
17
|
+
# Category slug → URL path
# ``FaneditClient.get_category`` looks its ``category`` argument up here and
# falls back to using the argument itself as the path when it is not a key.
CATEGORIES = {
    "fanfix": "category/fanedit-listings/fanfix/",
    "fanmix": "category/fanedit-listings/fanmix/",
    "extended": "category/fanedit-listings/extended-edition/",
    "tv_to_movie": "category/fanedit-listings/tv-to-movie/",
    "movie_to_tv": "category/fanedit-listings/movie-to-tv/",
    "shorts": "category/fanedit-listings/shorts/",
    "special": "category/fanedit-listings/custom-special-edition/",
    "documentary": "category/fanedit-listings/documentary-review/",
    "preservation": "category/preservation-listings/",
    "unapproved": "category/unapproved-fanedits/",
}

# Sort keys documented for the ``order`` argument of ``FaneditClient.search``.
# NOTE(review): nothing in this module validates ``order`` against this tuple.
ORDER_CHOICES = ("rdate", "date", "modified", "alpha", "rratio", "rvote")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class FaneditClient:
    """Scraping client for fanedit.org / IFDB.

    ``get_*`` methods fetch a single page through the shared ``Session`` and
    pass the HTML to the matching parser. ``iter_*`` methods repeat the
    corresponding ``get_*`` call page by page until the parser reports no
    next-page link or ``max_pages`` pages have been fetched (``0`` = no limit).
    """

    # Prefix used to turn site-relative paths into absolute URLs.
    _BASE_URL = "https://fanedit.org"

    REVIEW_ORDER_CHOICES = (
        "rdate",      # most recent
        "date",       # oldest
        "rating",     # most positive
        "rrating",    # most critical
        "updated",    # last updated
        "helpful",    # most helpful
        "rhelpful",   # least helpful
        "discussed",  # most discussed
    )

    def __init__(self, impersonate: str = "chrome120", cache_ttl: float = 300.0) -> None:
        """Create the client and its underlying ``Session``.

        Both arguments are forwarded unchanged to ``Session``.
        """
        self._s = Session(impersonate=impersonate, cache_ttl=cache_ttl)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _page_params(page: int) -> Optional[dict]:
        """Return the ``pg`` query parameter for ``page``; ``None`` for page 1."""
        return {"pg": page} if page > 1 else None

    @staticmethod
    def _is_absolute(url: str) -> bool:
        """Tell whether ``url`` already carries an http(s) scheme.

        Checking the full scheme (not just the ``http`` prefix) keeps slugs
        such as ``http-the-movie`` from being mistaken for absolute URLs.
        """
        return url.startswith(("http://", "https://"))

    @classmethod
    def _detail_url(cls, url: str) -> str:
        """Normalise a fanedit slug/path to ``<base>/<slug>/``; keep full URLs."""
        if cls._is_absolute(url):
            return url
        return f"{cls._BASE_URL}/{url.strip('/')}/"

    @classmethod
    def _news_url(cls, url: str) -> str:
        """Join a forums-relative path onto the base URL; keep full URLs.

        Exactly one slash separates host and path whether or not ``url``
        starts with ``/``.
        """
        if cls._is_absolute(url):
            return url
        return f"{cls._BASE_URL}/{url.lstrip('/')}"

    @staticmethod
    def _paginate(fetch_page, max_pages: int = 0) -> Iterator:
        """Yield items from consecutive pages.

        ``fetch_page`` is called with a 1-based page number and must return
        ``(items, next_page_url)``. Iteration stops after a page whose
        ``next_page_url`` is falsy, or once ``max_pages`` pages were fetched
        (``0`` disables the limit).
        """
        page = 1
        while True:
            items, next_url = fetch_page(page)
            yield from items
            if not next_url or (max_pages and page >= max_pages):
                break
            page += 1

    def _get_listing(
        self, path: str, page: int = 1
    ) -> tuple[List[FaneditSummary], Optional[str]]:
        """Fetch ``path`` (with ``pg`` when ``page > 1``) and parse it as a listing."""
        html = self._s.get(path, params=self._page_params(page))
        return parse_listing_page(html)

    # ------------------------------------------------------------------
    # Category browsing
    # ------------------------------------------------------------------

    def get_category(
        self,
        category: str,
        page: int = 1,
    ) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of a category.

        ``category`` may be a key from ``CATEGORIES`` or a full URL path.
        Returns (items, next_page_url).
        """
        path = CATEGORIES.get(category, category)
        return self._get_listing(path, page)

    def iter_category(self, category: str, max_pages: int = 0) -> Iterator[FaneditSummary]:
        """Yield all fanedits in a category across pages."""
        yield from self._paginate(
            lambda page: self.get_category(category, page=page), max_pages
        )

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def search(
        self,
        keywords: str,
        scope: str = "title",
        query_type: str = "all",
        order: str = "rdate",
        page: int = 1,
    ) -> tuple[List[FaneditSummary], Optional[str]]:
        """Search the IFDB.

        Args:
            keywords: search terms
            scope: "title" | "reviews"
            query_type: "all" | "any" | "exact"
            order: one of ORDER_CHOICES
            page: 1-based page number

        Returns:
            (items, next_page_url)
        """
        params: dict = {
            "query": query_type,
            "scope": scope,
            "keywords": keywords,
            "order": order,
        }
        if page > 1:
            params["pg"] = page
        html = self._s.get("fanedit-search/search-results/", params=params)
        return parse_listing_page(html)

    def iter_search(
        self,
        keywords: str,
        scope: str = "title",
        query_type: str = "all",
        order: str = "rdate",
        max_pages: int = 0,
    ) -> Iterator[FaneditSummary]:
        """Yield all search results across pages."""
        yield from self._paginate(
            lambda page: self.search(
                keywords, scope=scope, query_type=query_type, order=order, page=page
            ),
            max_pages,
        )

    # ------------------------------------------------------------------
    # Tag / franchise browsing
    # ------------------------------------------------------------------

    def get_by_tag(
        self,
        tag_type: str,
        tag_value: str,
        page: int = 1,
    ) -> tuple[List[FaneditSummary], Optional[str]]:
        """Browse by a tag, e.g. get_by_tag("franchise", "star-wars").

        Common tag_type values: franchise, faneditorname, originalmovietitle,
        fanedittype, faneditreleasedate, award.
        """
        path = f"fanedit-search/tag/{tag_type}/{tag_value}/"
        params: dict = {"criteria": "2"}
        if page > 1:
            params["pg"] = page
        html = self._s.get(path, params=params)
        return parse_listing_page(html)

    def iter_by_tag(
        self, tag_type: str, tag_value: str, max_pages: int = 0
    ) -> Iterator[FaneditSummary]:
        """Yield all fanedits carrying a tag across pages."""
        yield from self._paginate(
            lambda page: self.get_by_tag(tag_type, tag_value, page=page), max_pages
        )

    # ------------------------------------------------------------------
    # Curated lists
    # ------------------------------------------------------------------

    def get_latest(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of the ``latest-ifdb-fanedits/`` listing."""
        return self._get_listing("latest-ifdb-fanedits/", page)

    def get_top_trusted_rated(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of the ``top-trusted-reviewer-rated-fanedits/`` listing."""
        return self._get_listing("top-trusted-reviewer-rated-fanedits/", page)

    def get_top_user_rated(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of the ``top-user-rated-fanedits/`` listing."""
        return self._get_listing("top-user-rated-fanedits/", page)

    def get_most_popular(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of the ``most-popular/`` listing."""
        return self._get_listing("most-popular/", page)

    def get_award_winners(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of the ``award`` / ``fanedit-of-the-month`` tag listing."""
        return self.get_by_tag("award", "fanedit-of-the-month", page=page)

    # ------------------------------------------------------------------
    # Detail
    # ------------------------------------------------------------------

    def get_detail(self, url: str) -> FaneditDetail:
        """Fetch and parse a single fanedit detail page.

        ``url`` may be a full URL or a slug like ``/star-wars-despecialized/``.
        """
        url = self._detail_url(url)
        html = self._s.get(url)
        return parse_detail_page(html, url)

    # ------------------------------------------------------------------
    # Reviewer leaderboard
    # ------------------------------------------------------------------

    def get_reviewer_rank(self, page: int = 1) -> tuple[List[ReviewerEntry], Optional[str]]:
        """Return one page of the reviewer leaderboard (50 per page, ~3 400 total).

        Each entry includes numeric user_id, which can be passed directly to
        ``get_user_reviews()`` without needing to know the username.
        """
        html = self._s.get("reviewer-rank/", params=self._page_params(page))
        return parse_reviewer_rank_page(html)

    def iter_reviewer_rank(self, max_pages: int = 0) -> Iterator[ReviewerEntry]:
        """Yield all reviewers from the leaderboard across pages."""
        yield from self._paginate(
            lambda page: self.get_reviewer_rank(page=page), max_pages
        )

    # ------------------------------------------------------------------
    # Reviews by user
    # ------------------------------------------------------------------

    def get_user_reviews(
        self,
        user_id: int,
        page: int = 1,
        order: str = "rdate",
    ) -> tuple[List[UserReviewEntry], Optional[str]]:
        """Return one page of reviews written by a specific user.

        Args:
            user_id: numeric jReviews user ID (from ReviewerEntry.user_id or
                the /reviewer-rank/ page anchor id="user-N")
            page: 1-based page number
            order: one of REVIEW_ORDER_CHOICES
        """
        params: dict = {"order": order}
        if page > 1:
            params["pg"] = page
        html = self._s.get(f"my-reviews/{user_id}/", params=params)
        return parse_user_reviews_page(html)

    def iter_user_reviews(
        self,
        user_id: int,
        order: str = "rdate",
        max_pages: int = 0,
    ) -> Iterator[UserReviewEntry]:
        """Yield all reviews written by a user across pages."""
        yield from self._paginate(
            lambda page: self.get_user_reviews(user_id, page=page, order=order),
            max_pages,
        )

    def get_latest_user_reviews(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return the latest user reviews feed (all users)."""
        return self._get_listing("latest-user-reviews/", page)

    def get_latest_trusted_reviews(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return the latest trusted-reviewer reviews feed."""
        return self._get_listing("latest-trusted-reviewer-reviews/", page)

    # ------------------------------------------------------------------
    # News
    # ------------------------------------------------------------------

    def get_news(self) -> List[NewsArticle]:
        """Return the news front page article cards (up to ~15 articles)."""
        html = self._s.get("forums/news-publisher/")
        return parse_news_listing(html)

    def get_news_article(self, url: str) -> NewsArticle:
        """Fetch a full news article, including body text and mentioned IFDB URLs.

        ``url`` may be a full URL or a forums-relative path like
        ``/forums/news-publisher/some-article.185/``; the leading slash is
        optional.
        """
        url = self._news_url(url)
        html = self._s.get(url)
        return parse_news_article(html, url)
|
pyfanedit/models.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from typing import Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Per-aspect scores of a single review. Every score is optional and defaults
# to None, so an instance built with no arguments is valid.
class ReviewRatings(BaseModel):
    overall: Optional[float] = None
    audio_video_quality: Optional[float] = None
    audio_editing: Optional[float] = None
    visual_editing: Optional[float] = None
    narrative: Optional[float] = None
    enjoyment: Optional[float] = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# One review as it appears on a fanedit detail page. All fields are optional;
# ``ratings`` defaults to an empty ReviewRatings.
class Review(BaseModel):
    # Reviewer identity
    reviewer: Optional[str] = None
    reviewer_url: Optional[str] = None
    reviewer_rank: Optional[str] = None          # kept as text, not converted to a number
    reviewer_review_count: Optional[int] = None

    # Review content
    date: Optional[str] = None                   # stored as a string, not a datetime
    ratings: ReviewRatings = Field(default_factory=ReviewRatings)
    body: Optional[str] = None

    # Discussion link and helpfulness vote counters
    discussion_url: Optional[str] = None
    helpful_yes: Optional[int] = None
    helpful_no: Optional[int] = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class FaneditSummary(BaseModel):
    """Lightweight fanedit as returned from list/search pages."""
    # Canonical IDs
    fanedit_id: Optional[int] = None   # WordPress post ID — stable numeric ID
    slug: Optional[str] = None         # URL slug, e.g. "star-wars-begins"

    # Required: the only two fields without a default
    title: str
    url: str

    # Descriptive fields (all plain strings on this model)
    cover_url: Optional[str] = None
    faneditor: Optional[str] = None
    original_title: Optional[str] = None
    fanedit_type: Optional[str] = None
    franchise: Optional[str] = None    # single string here; FaneditDetail.franchise is a list
    release_date: Optional[str] = None
    running_time: Optional[str] = None
    synopsis: Optional[str] = None

    # Ratings and counters
    editor_rating: Optional[float] = None
    user_rating: Optional[float] = None
    user_rating_count: Optional[int] = None
    views: Optional[int] = None
    updated: Optional[str] = None      # date kept as the text shown on the page
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class FaneditDetail(BaseModel):
    """Full fanedit detail as parsed from its own page."""
    # Canonical IDs
    fanedit_id: Optional[int] = None   # WordPress post ID — stable numeric ID
    slug: Optional[str] = None         # URL slug

    # Required: the only two fields without a default
    title: str
    url: str

    cover_url: Optional[str] = None
    faneditor: Optional[str] = None
    original_title: Optional[str] = None
    genre: Optional[List[str]] = None       # list of values, unlike the string fields below
    franchise: Optional[List[str]] = None   # list here; FaneditSummary.franchise is a string
    fanedit_type: Optional[str] = None

    # Source film metadata
    original_release_date: Optional[str] = None
    original_running_time: Optional[str] = None
    imdb_id: Optional[str] = None      # e.g. "tt0076759" — from embedded IMDB link

    # Edit metadata
    fanedit_release_date: Optional[str] = None
    fanedit_running_time: Optional[str] = None
    time_cut: Optional[str] = None
    time_added: Optional[str] = None
    subtitles: Optional[str] = None
    available_in: Optional[str] = None       # HD / SD / Surround Sound etc.
    release_information: Optional[str] = None  # Digital / Physical etc.

    # Content
    synopsis: Optional[str] = None
    additional_notes: Optional[str] = None
    special_thanks: Optional[str] = None
    cuts_and_additions: Optional[str] = None
    intention: Optional[str] = None    # editor's stated intent
    awards: Optional[str] = None       # e.g. Fanedit of the Month

    # Ratings
    editor_rating: Optional[float] = None
    user_rating: Optional[float] = None
    user_rating_count: Optional[int] = None

    # Reviews (each list defaults to a fresh empty list per instance)
    editor_reviews: List[Review] = Field(default_factory=list)
    user_reviews: List[Review] = Field(default_factory=list)

    # Raw overflow for any unmapped fields
    extra_fields: Dict[str, str] = Field(default_factory=dict)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class ReviewerEntry(BaseModel):
    """One row from the reviewer leaderboard."""
    # Required fields: validation fails if any of these is missing.
    rank: int
    user_id: int                          # numeric jReviews user ID (stable)
    username: str
    profile_url: str                      # /members/{username}/
    reviews_url: str                      # /my-reviews/{user_id}/
    review_count: int
    # Optional helpfulness statistics.
    helpful_yes: Optional[int] = None
    helpful_pct: Optional[float] = None   # e.g. 76.61
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class UserReviewEntry(BaseModel):
    """One review from a user's review list page."""
    # The reviewed fanedit (both required).
    fanedit_title: str
    fanedit_url: str
    fanedit_type: Optional[str] = None
    # The review itself; ``ratings`` defaults to an empty ReviewRatings.
    date: Optional[str] = None            # stored as a string, not a datetime
    ratings: ReviewRatings = Field(default_factory=ReviewRatings)
    discussion_url: Optional[str] = None
    comment_count: Optional[int] = None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class NewsArticle(BaseModel):
    """A news article from the front page or article page."""
    # from listing card (thread_id, title and url are required)
    thread_id: int                        # XenForo thread ID
    title: str
    url: str                              # full URL to article
    thumbnail_url: Optional[str] = None
    author: Optional[str] = None
    author_user_id: Optional[int] = None
    published_at: Optional[str] = None    # ISO datetime string
    reading_time: Optional[str] = None
    # from article page (only when fetched individually)
    views: Optional[int] = None
    category: Optional[str] = None
    body_html: Optional[str] = None
    body_text: Optional[str] = None
    mentioned_fanedit_urls: List[str] = Field(default_factory=list)
|
pyfanedit/parsers.py
ADDED
|
@@ -0,0 +1,690 @@
|
|
|
1
|
+
"""HTML parsers for fanedit.org pages."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup, Tag
|
|
8
|
+
|
|
9
|
+
from pyfanedit.models import (
|
|
10
|
+
FaneditDetail, FaneditSummary, NewsArticle,
|
|
11
|
+
Review, ReviewRatings, ReviewerEntry, UserReviewEntry,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Map of raw label text → FaneditDetail field name
# Keys are matched against the lower-cased text of a ``jrFieldLabel`` element,
# so they are lower-case and keep the trailing colon.
_DETAIL_FIELD_MAP = {
    "faneditor name:": "faneditor",
    "original movie/show title:": "original_title",
    "fanedit type:": "fanedit_type",
    "original release date:": "original_release_date",
    "original running time:": "original_running_time",
    "fanedit release date:": "fanedit_release_date",
    "fanedit running time:": "fanedit_running_time",
    "time cut:": "time_cut",
    "time added:": "time_added",
    "subtitles available:": "subtitles",
    "available in:": "available_in",
    "synopsis:": "synopsis",
    "additional notes:": "additional_notes",
    "special thanks:": "special_thanks",
    "release information:": "release_information",
    "cuts and additions:": "cuts_and_additions",
    "intention:": "intention",
    "awards:": "awards",
}

# Map of raw label text → FaneditSummary field name (same key convention as
# above). Note the summary model uses the shorter ``release_date`` /
# ``running_time`` names for the fanedit's own date and length.
_SUMMARY_FIELD_MAP = {
    "faneditor name:": "faneditor",
    "original movie/show title:": "original_title",
    "fanedit type:": "fanedit_type",
    "fanedit release date:": "release_date",
    "fanedit running time:": "running_time",
    "synopsis:": "synopsis",
    "franchise:": "franchise",
}

# Map of rating-row label → ReviewRatings field name. Keys are compared with
# the lower-cased label text and carry no trailing colon.
_REVIEW_RATING_MAP = {
    "overall rating": "overall",
    "audio/video quality": "audio_video_quality",
    "audio editing": "audio_editing",
    "visual editing": "visual_editing",
    "narrative": "narrative",
    "enjoyment": "enjoyment",
}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _parse_overall_rating(el: Optional[Tag]) -> Tuple[Optional[float], Optional[int]]:
    """Extract ``(rating, vote_count)`` from an overall-rating element.

    The text of the nested ``jrRatingValue`` element is expected to look like
    ``"4.5 (12)"``: a leading number, optionally followed by a parenthesised
    vote count.

    Args:
        el: the rating container, or ``None`` when the page has none.

    Returns:
        ``(rating, count)``; either part is ``None`` when it is absent or not
        a well-formed number. ``(None, None)`` when ``el`` or its value
        element is missing.
    """
    if el is None:
        return None, None
    val_el = el.find(class_="jrRatingValue")
    if val_el is None:
        return None, None
    text = val_el.get_text(" ", strip=True)

    count_m = re.search(r"\((\d+)\)", text)
    count = int(count_m.group(1)) if count_m else None

    # Match one well-formed decimal at the start of the text. A bare
    # character class such as [\d.]+ would also accept "..." or "4.5.6",
    # which float() rejects with ValueError.
    rating_m = re.match(r"\d+(?:\.\d*)?|\.\d+", text)
    rating = float(rating_m.group()) if rating_m else None
    return rating, count
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _cover_url(soup: BeautifulSoup) -> Optional[str]:
    """Return the cover image URL of the first ``jrMediaPhoto`` element.

    ``data-jr-src`` is preferred over ``src``. ``None`` is returned when
    there is no such element, when the chosen value is empty, or when it is
    an inline ``data:`` URI.
    """
    photo = soup.find(class_="jrMediaPhoto")
    if photo is None:
        return None
    source = photo.get("data-jr-src") or photo.get("src", "")
    if source.startswith("data:") or not source:
        return None
    return source
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _imdb_id(soup: BeautifulSoup) -> Optional[str]:
    """Return the first ``tt<digits>`` id found in any link's ``href``.

    Links are scanned in document order; ``None`` means no ``href`` contains
    a ``/tt<digits>`` segment.
    """
    hits = (
        re.search(r"/(tt\d+)", link["href"])
        for link in soup.find_all("a", href=True)
    )
    return next((hit.group(1) for hit in hits if hit), None)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _wp_post_id(soup: BeautifulSoup) -> Optional[int]:
    """Return the number from the first ``postid-<n>`` class on ``<body>``.

    ``None`` is returned when there is no ``<body>`` or none of its classes
    starts with ``postid-`` followed by digits.
    """
    body = soup.find("body")
    if not body:
        return None
    candidates = (re.match(r"postid-(\d+)", name) for name in body.get("class", []))
    found = next((m for m in candidates if m), None)
    return int(found.group(1)) if found else None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _slug_from_url(url: str) -> str:
    """Return the last path segment of ``url``.

    Any ``#fragment`` or ``?query`` suffix is dropped first, so
    ``https://fanedit.org/star-wars-begins/?pg=2`` yields
    ``"star-wars-begins"`` rather than ``"?pg=2"``. Trailing slashes are
    ignored; a string without ``/`` is returned as-is.
    """
    # Fragment first: a "?" inside the fragment is not a query separator.
    path = url.split("#", 1)[0].split("?", 1)[0]
    return path.rstrip("/").rsplit("/", 1)[-1]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _parse_review_ratings(rating_table: Tag) -> ReviewRatings:
    """Convert the rows of a rating table into a ``ReviewRatings``.

    In each ``fwd-table-row`` the first label/value cell supplies the label
    and the last one the score. Rows with fewer than two such cells, labels
    missing from ``_REVIEW_RATING_MAP`` and non-numeric scores are skipped.
    """
    scores: dict = {}
    for row in rating_table.find_all(class_="fwd-table-row"):
        cells = row.find_all(class_=["jrRatingLabel", "jrRatingValue"])
        if len(cells) >= 2:
            label_text = cells[0].get_text(strip=True).lower()
            score_text = cells[-1].get_text(strip=True)
            target = _REVIEW_RATING_MAP.get(label_text)
            if not target:
                continue
            try:
                score = float(score_text)
            except ValueError:
                continue
            scores[target] = score
    return ReviewRatings(**scores)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _vote_count(vote_el: Tag, css_class: str) -> Optional[int]:
    """Read the ``count-text`` number inside the vote button ``css_class``."""
    button = vote_el.find(class_=css_class)
    if not button:
        return None
    try:
        return int(button.find(class_="count-text").get_text(strip=True))
    except (AttributeError, ValueError):
        # Missing counter element or non-numeric text: leave the count unset.
        return None


def _parse_review(review_el: Tag) -> Review:
    """Build a ``Review`` from one ``jrReviewLayout`` element.

    Every piece of data is optional: a missing sub-element simply leaves the
    corresponding ``Review`` field at ``None`` (or an empty ``ReviewRatings``).
    """
    # reviewer identity (left panel)
    reviewer = reviewer_url = reviewer_rank = None
    reviewer_review_count = None
    panel = review_el.find(class_="jrReviewLayoutLeft")
    if panel:
        name_el = panel.find(itemprop="name")
        reviewer = name_el.get_text(strip=True) if name_el else None

        link_el = panel.find(itemprop="url")
        reviewer_url = link_el.get("href") if link_el else None

        badge_el = panel.find(class_="jrReviewerRank")
        reviewer_rank = badge_el.get_text(strip=True) if badge_el else None

        total_el = panel.find(class_="jrReviewerReviews")
        digits = re.search(r"(\d+)", total_el.get_text()) if total_el else None
        reviewer_review_count = int(digits.group(1)) if digits else None

    # date: prefer the machine-readable attribute over the visible text
    stamp_el = review_el.find("time", class_="jrReviewCreated")
    date = None
    if stamp_el:
        date = stamp_el.get("datetime") or stamp_el.get_text(strip=True)

    # ratings
    table = review_el.find(class_="jrRatingTable")
    if table:
        ratings = _parse_review_ratings(table)
    else:
        ratings = ReviewRatings()

    # body text
    text_el = review_el.find(class_="jrReviewComment")
    body = text_el.get_text(" ", strip=True) if text_el else None

    # discussion / helpful
    discussion_url = None
    helpful_yes = helpful_no = None
    actions = review_el.find(class_="jrReviewActions")
    if actions:
        discuss_link = actions.find(class_="jrDiscussReview")
        if discuss_link:
            discussion_url = discuss_link.get("href")
        votes = actions.find(class_="jr-review-vote")
        if votes:
            helpful_yes = _vote_count(votes, "jrVoteYes")
            helpful_no = _vote_count(votes, "jrVoteNo")

    return Review(
        reviewer=reviewer,
        reviewer_url=reviewer_url,
        reviewer_rank=reviewer_rank,
        reviewer_review_count=reviewer_review_count,
        date=date,
        ratings=ratings,
        body=body,
        discussion_url=discussion_url,
        helpful_yes=helpful_yes,
        helpful_no=helpful_no,
    )
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _parse_reviews(soup: BeautifulSoup, container_class: str) -> List[Review]:
    """Parse every real review inside the first ``container_class`` element.

    Returns an empty list when the container is absent. Layout elements
    without a ``jrReviewLayoutLeft`` child are the summary placeholder (no
    real reviewer) and are skipped.
    """
    container = soup.find(class_=container_class)
    if container is None:
        return []
    return [
        _parse_review(layout)
        for layout in container.find_all(class_="jrReviewLayout")
        if layout.find(class_="jrReviewLayoutLeft") is not None
    ]
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _parse_outer(outer: Tag) -> Optional[FaneditSummary]:
    """Build a ``FaneditSummary`` from one listing row/card element.

    Args:
        outer: the element wrapping a single listing entry.

    Returns:
        The parsed summary, or ``None`` when the element has no
        ``jrListingTitle`` link with a non-empty ``href`` (e.g. layout rows
        that are not real entries).
    """
    title_el = outer.find(class_="jrListingTitle")
    if title_el is None:
        return None
    a = title_el.find("a")
    if a is None:
        return None
    # An anchor without href cannot be turned into a summary; skip it instead
    # of raising KeyError and aborting the whole page.
    url = a.get("href")
    if not url:
        return None

    title = a.get_text(strip=True)
    slug = _slug_from_url(url)

    # cover: an empty or inline data: source counts as "no cover", matching
    # what _cover_url does for detail pages
    cover_url: Optional[str] = None
    thumb = outer.find(class_="jrListingThumbnail")
    if thumb:
        img = thumb.find("img")
        if img:
            src = img.get("data-jr-src") or img.get("src", "")
            if not src.startswith("data:"):
                cover_url = src or None

    # ratings
    editor_r, _ = _parse_overall_rating(outer.find(class_="jrOverallEditor"))
    user_r, user_cnt = _parse_overall_rating(outer.find(class_="jrOverallUser"))

    # views: the span holding the graph icon carries the counter as its text
    views: Optional[int] = None
    for span in outer.find_all("span"):
        if span.find(class_="jrIconGraph"):
            try:
                # tolerate thousands separators such as "1,234"
                views = int(span.get_text(strip=True).replace(",", ""))
            except ValueError:
                # span with extra text (e.g. an outer wrapper): keep looking
                pass

    # date
    updated: Optional[str] = None
    date_el = outer.find(class_="jrDateValue")
    if date_el:
        updated = date_el.get_text(strip=True)

    # fanedit type from jrListingCategory (search results layout)
    fanedit_type: Optional[str] = None
    cat_el = outer.find(class_="jrListingCategory")
    if cat_el:
        fanedit_type = cat_el.get_text(strip=True)

    # custom fields
    kwargs: dict = {}
    for row in outer.find_all(class_="jrFieldRow"):
        lbl = row.find(class_="jrFieldLabel")
        val = row.find(class_="jrFieldValue")
        if lbl and val:
            key = lbl.get_text(strip=True).lower()
            mapped = _SUMMARY_FIELD_MAP.get(key)
            if mapped:
                kwargs[mapped] = val.get_text(" ", strip=True)

    # an explicit "Fanedit Type:" field row wins over the category badge
    if fanedit_type and "fanedit_type" not in kwargs:
        kwargs["fanedit_type"] = fanedit_type

    return FaneditSummary(
        slug=slug,
        title=title,
        url=url,
        cover_url=cover_url,
        editor_rating=editor_r,
        user_rating=user_r,
        user_rating_count=user_cnt,
        views=views,
        updated=updated,
        **kwargs,
    )
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def parse_listing_page(html: str) -> Tuple[List[FaneditSummary], Optional[str]]:
    """Return (items, next_page_url) from a category/search listing page.

    ``next_page_url`` is the ``href`` of the first ``<a>`` sibling following
    the current-page marker inside the pagination block, or ``None`` when any
    of those pieces is missing.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Category pages use "jr-layout-outer"; search/tag pages use "jrRow"
    # (minus the header row of the data list).
    rows = soup.find_all(class_="jr-layout-outer")
    if not rows:
        rows = [
            row
            for row in soup.find_all(class_="jrRow")
            if "jrDataListHeader" not in (row.get("class") or [])
        ]

    parsed = (_parse_outer(row) for row in rows)
    items: List[FaneditSummary] = [summary for summary in parsed if summary]

    # next page
    pager = soup.find(class_="jrPagination")
    current = pager.find(class_="jrPageCurrent") if pager else None
    following = current.find_next_sibling("a") if current else None
    next_url: Optional[str] = None
    if following and following.get("href"):
        next_url = following["href"]

    return items, next_url
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def parse_detail_page(html: str, url: str) -> FaneditDetail:
    """Build a FaneditDetail from the HTML of a single fanedit page.

    ``url`` is stored on the result and is also used to derive the slug and,
    when the page has no title element, the title.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Title: prefer the listing title, fall back to the content heading,
    # and finally to the slug taken from the URL.
    heading = soup.find(class_="jrListingTitle") or soup.find("h1", class_="contentheading")
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = _slug_from_url(url)

    fanedit_id = _wp_post_id(soup)
    slug = _slug_from_url(url)
    cover_url = _cover_url(soup)
    imdb_id = _imdb_id(soup)

    # Overall ratings; the count returned for the editor rating is discarded.
    editor_r, _ = _parse_overall_rating(soup.find(class_="jrOverallEditor"))
    user_r, user_cnt = _parse_overall_rating(soup.find(class_="jrOverallUser"))

    # Collect field rows keyed by lower-cased label. The first occurrence of
    # a label wins; later duplicates are ignored.
    first_value_by_label: dict = {}
    for field_row in soup.find_all(class_="jrFieldRow"):
        label_el = field_row.find(class_="jrFieldLabel")
        value_el = field_row.find(class_="jrFieldValue")
        if not (label_el and value_el):
            continue
        first_value_by_label.setdefault(label_el.get_text(strip=True).lower(), value_el)

    # Labels whose value is a list of link texts rather than a single string.
    link_list_labels = {"genre:": "genre", "franchise:": "franchise"}

    known: dict = {}
    extra: dict = {}
    for label, value_el in first_value_by_label.items():
        text = value_el.get_text(" ", strip=True)
        target = _DETAIL_FIELD_MAP.get(label)
        if target:
            known[target] = text
            continue
        list_field = link_list_labels.get(label)
        if list_field is None:
            # Unrecognised label: keep it verbatim in extra_fields.
            extra[label] = text
            continue
        # Use the individual link texts; fall back to the raw text when the
        # value contains no links.
        link_texts = [link.get_text(strip=True) for link in value_el.find_all("a")]
        known[list_field] = link_texts or [text]

    editor_reviews = _parse_reviews(soup, "jrEditorReviewsContainer")
    user_reviews = _parse_reviews(soup, "jrUserReviewsContainer")

    return FaneditDetail(
        fanedit_id=fanedit_id,
        slug=slug,
        title=title,
        url=url,
        cover_url=cover_url,
        imdb_id=imdb_id,
        editor_rating=editor_r,
        user_rating=user_r,
        user_rating_count=user_cnt,
        editor_reviews=editor_reviews,
        user_reviews=user_reviews,
        extra_fields=extra,
        **known,
    )
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
# ---------------------------------------------------------------------------
|
|
372
|
+
# Reviewer leaderboard
|
|
373
|
+
# ---------------------------------------------------------------------------
|
|
374
|
+
|
|
375
|
+
def parse_reviewer_rank_page(html: str) -> Tuple[List[ReviewerEntry], Optional[str]]:
    """Parse one page of /reviewer-rank/.

    Returns ``(entries, next_page_url)``. A row is skipped when it is the
    header row, has no "jrCenterAlign" column, has no ``user-<id>`` id on
    that column, has a non-integer rank, or has no "jrReviewAuthor" element.
    ``next_page_url`` is ``None`` when no link follows the current-page
    marker in the pagination widget.
    """
    soup = BeautifulSoup(html, "html.parser")
    entries: List[ReviewerEntry] = []

    # Compiled once; every row is matched against the same patterns.
    user_id_re = re.compile(r"user-(\d+)")
    count_re = re.compile(r"(\d+)")
    helpful_re = re.compile(r"Helpful votes:\s*(\d+)\s*\(([0-9.]+)%\)")

    for row in soup.find_all(class_="jrRow"):
        if "jrDataListHeader" in (row.get("class") or []):
            continue

        rank_col = row.find(class_="jrCenterAlign")
        if rank_col is None:
            continue
        user_id_m = user_id_re.match(rank_col.get("id", ""))
        if not user_id_m:
            continue
        user_id = int(user_id_m.group(1))
        try:
            rank = int(rank_col.get_text(strip=True))
        except ValueError:
            continue

        # username + profile URL
        author_el = row.find(class_="jrReviewAuthor")
        if author_el is None:
            continue
        a = author_el.find("a")
        username = a.get_text(strip=True) if a else ""
        # .get() instead of ["href"]: an anchor without href must not abort
        # the whole page with a KeyError.
        profile_url = (a.get("href") or "") if a else ""

        # review count + helpful votes
        # Default reviews URL, used unless the row carries its own link.
        reviews_url = f"https://fanedit.org/my-reviews/{user_id}/"
        review_count = 0
        helpful_yes = None
        helpful_pct = None
        content = row.find(class_="jrRankContent")
        if content:
            rev_a = content.find("a")
            if rev_a:
                reviews_url = rev_a.get("href") or reviews_url
                m = count_re.search(rev_a.get_text())
                if m:
                    review_count = int(m.group(1))
            text = content.get_text(" ", strip=True)
            hm = helpful_re.search(text)
            if hm:
                helpful_yes = int(hm.group(1))
                try:
                    helpful_pct = float(hm.group(2))
                except ValueError:
                    # "[0-9.]+" also matches text such as "1.2.3"; treat an
                    # unparseable percentage as missing rather than crash.
                    helpful_pct = None

        entries.append(ReviewerEntry(
            rank=rank,
            user_id=user_id,
            username=username,
            profile_url=profile_url,
            reviews_url=reviews_url,
            review_count=review_count,
            helpful_yes=helpful_yes,
            helpful_pct=helpful_pct,
        ))

    next_url: Optional[str] = None
    pagenav = soup.find(class_="jrPagination")
    if pagenav:
        current = pagenav.find(class_="jrPageCurrent")
        if current:
            nxt = current.find_next_sibling("a")
            if nxt and nxt.get("href"):
                next_url = nxt["href"]

    return entries, next_url
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
# ---------------------------------------------------------------------------
|
|
450
|
+
# Reviews by user
|
|
451
|
+
# ---------------------------------------------------------------------------
|
|
452
|
+
|
|
453
|
+
def parse_user_reviews_page(html: str) -> Tuple[List[UserReviewEntry], Optional[str]]:
    """Parse one page of /my-reviews/{user_id}/.

    Returns ``(reviews, next_page_url)``. A review element is skipped when
    it has no listing-title link or when that link has no href.
    ``next_page_url`` is ``None`` when no link follows the current-page
    marker in the pagination widget.
    """
    soup = BeautifulSoup(html, "html.parser")
    reviews: List[UserReviewEntry] = []
    comments_re = re.compile(r"Comments?\s*\((\d+)\)")

    for el in soup.find_all(class_="jrReviewListLayout"):
        # fanedit link
        listing_title = el.find(class_="jrListingTitle")
        if listing_title is None:
            continue
        a = listing_title.find("a")
        if a is None:
            continue
        # .get() instead of ["href"]: an anchor without href must not abort
        # the whole page with a KeyError.
        href = a.get("href")
        if not href:
            continue
        if href.startswith("http"):
            fanedit_url = href
        else:
            # Slash-safe join, so a relative href without a leading "/"
            # does not get glued onto the host name.
            fanedit_url = "https://fanedit.org/" + href.lstrip("/")
        fanedit_title = a.get_text(strip=True)

        fanedit_type: Optional[str] = None
        cat_el = el.find(class_="jrListingCategory")
        if cat_el:
            fanedit_type = cat_el.get_text(strip=True)

        # Prefer the machine-readable datetime attribute over visible text.
        date: Optional[str] = None
        date_el = el.find(class_="jrReviewCreated")
        if date_el:
            date = date_el.get("datetime") or date_el.get_text(strip=True)

        rating_table = el.find(class_="jrRatingTable")
        ratings = _parse_review_ratings(rating_table) if rating_table else ReviewRatings()

        # Discussion link and comment count. When several buttons point at
        # /discussions/, the last one wins.
        discussion_url: Optional[str] = None
        comment_count: Optional[int] = None
        for btn_a in el.find_all("a", class_="jrButton"):
            btn_href = btn_a.get("href", "")
            if "/discussions/" in btn_href:
                discussion_url = btn_href
                m = comments_re.search(btn_a.get_text())
                if m:
                    comment_count = int(m.group(1))

        reviews.append(UserReviewEntry(
            fanedit_title=fanedit_title,
            fanedit_url=fanedit_url,
            fanedit_type=fanedit_type,
            date=date,
            ratings=ratings,
            discussion_url=discussion_url,
            comment_count=comment_count,
        ))

    next_url: Optional[str] = None
    pagenav = soup.find(class_="jrPagination")
    if pagenav:
        current = pagenav.find(class_="jrPageCurrent")
        if current:
            nxt = current.find_next_sibling("a")
            if nxt and nxt.get("href"):
                next_url = nxt["href"]

    return reviews, next_url
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
# ---------------------------------------------------------------------------
|
|
517
|
+
# News
|
|
518
|
+
# ---------------------------------------------------------------------------
|
|
519
|
+
|
|
520
|
+
def _thread_id_from_card(card: Tag) -> Optional[int]:
|
|
521
|
+
for cls in card.get("class", []):
|
|
522
|
+
m = re.match(r"js-threadListItem-(\d+)", cls)
|
|
523
|
+
if m:
|
|
524
|
+
return int(m.group(1))
|
|
525
|
+
return None
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def parse_news_listing(html: str) -> List[NewsArticle]:
    """Parse the news front page (/forums/news-publisher/).

    Each element with class "newsCard-grid-item" becomes one NewsArticle.
    A card is skipped when no thread id can be read from its CSS classes or
    when it has no title link with an href.
    """
    soup = BeautifulSoup(html, "html.parser")
    articles: List[NewsArticle] = []
    reading_time_re = re.compile(r"(\d+\s*min read)")

    for card in soup.find_all(class_="newsCard-grid-item"):
        thread_id = _thread_id_from_card(card)
        if thread_id is None:
            continue

        title_el = card.find(class_="newsCard-grid-title")
        if title_el is None:
            continue
        a = title_el.find("a")
        if a is None:
            continue
        # .get() instead of ["href"]: an anchor without href must not abort
        # the whole listing with a KeyError.
        href = a.get("href")
        if not href:
            continue
        title = a.get_text(strip=True)
        if href.startswith("http"):
            url = href
        else:
            # Slash-safe join, so a relative href without a leading "/"
            # does not get glued onto the host name.
            url = "https://fanedit.org/" + href.lstrip("/")

        thumbnail_url: Optional[str] = None
        img = card.find("img", class_="newsCard-grid-image-link")
        if img:
            thumbnail_url = img.get("src")

        # The author name comes from the avatar image's alt text; the user
        # id comes from the enclosing link's data-user-id attribute.
        author: Optional[str] = None
        author_user_id: Optional[int] = None
        avatar_a = card.find("a", attrs={"data-user-id": True})
        if avatar_a:
            author_img = avatar_a.find("img")
            if author_img:
                author = author_img.get("alt")
            try:
                author_user_id = int(avatar_a["data-user-id"])
            except (ValueError, KeyError):
                # Non-numeric id: leave author_user_id as None.
                pass

        published_at: Optional[str] = None
        time_el = card.find("time")
        if time_el:
            published_at = time_el.get("datetime")

        reading_time: Optional[str] = None
        for li in card.find_all("li", class_="newsCard-date"):
            text = li.get_text(strip=True)
            if "min read" in text:
                # strip SVG title prefix ("Reading time2 min read" → "2 min read")
                m = reading_time_re.search(text)
                reading_time = m.group(1) if m else text

        articles.append(NewsArticle(
            thread_id=thread_id,
            title=title,
            url=url,
            thumbnail_url=thumbnail_url,
            author=author,
            author_user_id=author_user_id,
            published_at=published_at,
            reading_time=reading_time,
        ))

    return articles
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def parse_news_article(html: str, url: str) -> NewsArticle:
    """Build a NewsArticle from the HTML of a single news article page.

    ``url`` is stored on the result unchanged. ``thread_id`` is 0 when the
    page has no "newsBody-main" article element with digits in its
    ``data-lb-id`` attribute.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Thread id: first run of digits in the article's data-lb-id attribute.
    thread_id = 0
    main_article = soup.find("article", class_="newsBody-main")
    if main_article:
        id_match = re.search(r"(\d+)", main_article.get("data-lb-id", ""))
        if id_match:
            thread_id = int(id_match.group(1))

    # Title: the dedicated heading if present, otherwise the first <h1>.
    heading = soup.find("h1", class_="p-title-value") or soup.find("h1")
    title = heading.get_text(strip=True) if heading else ""

    # Thumbnail
    header_img = soup.find("img", class_="newsView-newsThumbnail-header")
    thumbnail_url: Optional[str] = header_img.get("src") if header_img else None

    # Author, publication time and reading time all come from the
    # description block under the title.
    author: Optional[str] = None
    author_user_id: Optional[int] = None
    published_at: Optional[str] = None
    reading_time: Optional[str] = None

    description = soup.find(class_="p-description")
    if description:
        user_link = description.find("a", attrs={"data-user-id": True})
        if user_link:
            author = user_link.get_text(strip=True)
            try:
                author_user_id = int(user_link["data-user-id"])
            except (ValueError, KeyError):
                pass

        stamp = description.find("time")
        if stamp:
            published_at = stamp.get("datetime")

        for entry in description.find_all("li"):
            entry_text = entry.get_text(strip=True)
            if "min read" not in entry_text:
                continue
            found = re.search(r"(\d+\s*min read)", entry_text)
            reading_time = found.group(1) if found else entry_text

    # View count: scan every "pairs--justified" element; the last one that
    # yields a parseable number wins.
    views: Optional[int] = None
    for pair in soup.find_all(class_="pairs--justified"):
        views_match = re.search(r"Views\s+([\d,]+)", pair.get_text(" ", strip=True))
        if not views_match:
            continue
        try:
            views = int(views_match.group(1).replace(",", ""))
        except ValueError:
            pass

    # Category: text of the last link in the last breadcrumb container.
    category: Optional[str] = None
    trails = soup.find_all(class_="p-breadcrumbs") or soup.find_all(attrs={"itemprop": "breadcrumb"})
    if trails:
        trail_links = trails[-1].find_all("a")
        if trail_links:
            category = trail_links[-1].get_text(strip=True)

    # Body, plus de-duplicated fanedit.org links (forum links excluded) in
    # order of first appearance.
    body_html: Optional[str] = None
    body_text: Optional[str] = None
    mentioned: List[str] = []
    wrapper = soup.find(class_="bbWrapper")
    if wrapper:
        body_html = str(wrapper)
        body_text = wrapper.get_text(" ", strip=True)
        candidate_hrefs = (link["href"] for link in wrapper.find_all("a", href=True))
        mentioned = list(dict.fromkeys(
            target for target in candidate_hrefs
            if "fanedit.org" in target and "/forums/" not in target
        ))

    return NewsArticle(
        thread_id=thread_id,
        title=title,
        url=url,
        thumbnail_url=thumbnail_url,
        author=author,
        author_user_id=author_user_id,
        published_at=published_at,
        reading_time=reading_time,
        views=views,
        category=category,
        body_html=body_html,
        body_text=body_text,
        mentioned_fanedit_urls=mentioned,
    )
|
pyfanedit/session.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""HTTP layer using curl_cffi to bypass TLS fingerprinting."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import time
|
|
5
|
+
from threading import Lock
|
|
6
|
+
from typing import Any, Dict, Mapping, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
from curl_cffi import requests
|
|
9
|
+
|
|
10
|
+
SITE_URL = "https://fanedit.org"
|
|
11
|
+
_CacheKey = Tuple[str, str, Tuple[Tuple[str, str], ...]]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Session:
    """HTTP session for fanedit.org with a small in-memory GET cache.

    Requests go through a ``curl_cffi`` session created with the given
    ``impersonate`` target. Successful GET responses are cached by
    ``(method, url, sorted params)``.

    Args:
        impersonate: Value passed to ``requests.Session(impersonate=...)``.
        cache_ttl: Maximum age of a cached response in seconds. A falsy
            value (``0``) disables expiry.
        cache_size: Maximum number of cached responses. A falsy value
            (``0``) disables storing responses.
    """

    # Headers sent with every request; post() adds a Referer on top.
    _DEFAULT_HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }

    def __init__(
        self,
        impersonate: str = "chrome120",
        cache_ttl: float = 300.0,
        cache_size: int = 512,
    ) -> None:
        self._session = requests.Session(impersonate=impersonate)
        self.cache_ttl = cache_ttl
        self.cache_size = cache_size
        # key -> (time stored, response text)
        self._cache: Dict[_CacheKey, Tuple[float, str]] = {}
        self._lock = Lock()

    @staticmethod
    def _url(path: str) -> str:
        """Return ``path`` unchanged if it is an absolute http(s) URL, else join it to SITE_URL."""
        # Checking the full scheme keeps relative paths that merely begin
        # with "http" from being mistaken for absolute URLs.
        if path.startswith(("http://", "https://")):
            return path
        return SITE_URL + "/" + path.lstrip("/")

    def _key(self, url: str, params: Optional[Mapping[str, Any]]) -> _CacheKey:
        """Build the cache key: method, URL, and params sorted and stringified."""
        items = tuple(sorted((k, str(v)) for k, v in (params or {}).items()))
        return ("GET", url, items)

    def _cache_get(self, key: _CacheKey) -> Optional[str]:
        """Return the cached text for ``key``, or None if absent or expired.

        An expired entry is removed from the cache.
        """
        # Lookup, expiry check and removal share one lock acquisition so
        # that a fresh entry stored by another thread between the check and
        # the removal is never deleted.
        with self._lock:
            hit = self._cache.get(key)
            if hit is None:
                return None
            ts, text = hit
            if self.cache_ttl and (time.time() - ts) > self.cache_ttl:
                del self._cache[key]
                return None
            return text

    def _cache_put(self, key: _CacheKey, text: str) -> None:
        """Store ``text`` under ``key``, evicting the oldest entry if the cache is full."""
        if not self.cache_size:
            return
        with self._lock:
            # Evict only when a NEW key is added to a full cache; overwriting
            # an existing key does not grow the cache, so no other entry has
            # to be dropped.
            if (
                key not in self._cache
                and len(self._cache) >= self.cache_size
                and self._cache
            ):
                oldest = min(self._cache.items(), key=lambda kv: kv[1][0])[0]
                self._cache.pop(oldest, None)
            self._cache[key] = (time.time(), text)

    def get(
        self,
        path: str,
        params: Optional[Mapping[str, Any]] = None,
        use_cache: bool = True,
    ) -> str:
        """GET ``path`` (absolute URL or site-relative path) and return the response text.

        With ``use_cache`` true, a cached response is returned when one is
        available, and a fresh response is stored after a successful request.
        Errors raised by ``raise_for_status()`` propagate to the caller.
        """
        url = self._url(path)
        key = self._key(url, params)
        if use_cache:
            cached = self._cache_get(key)
            if cached is not None:
                return cached
        r = self._session.get(url, params=params, headers=dict(self._DEFAULT_HEADERS))
        r.raise_for_status()
        if use_cache:
            self._cache_put(key, r.text)
        return r.text

    def post(self, path: str, data: Optional[Mapping[str, Any]] = None) -> str:
        """POST ``data`` to ``path`` and return the response text (never cached).

        Errors raised by ``raise_for_status()`` propagate to the caller.
        """
        url = self._url(path)
        headers = dict(self._DEFAULT_HEADERS)
        headers["Referer"] = SITE_URL + "/fanedit-search/"
        r = self._session.post(url, data=data, headers=headers)
        r.raise_for_status()
        return r.text
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
default_session = Session()
|
pyfanedit/version.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyfanedit
|
|
3
|
+
Version: 0.1.1a1
|
|
4
|
+
Summary: Scraping client for fanedit.org IFDB
|
|
5
|
+
Author-email: JarbasAi <jarbasai@mailfence.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/OpenJarbas/pyfanedit
|
|
8
|
+
Requires-Python: >=3.8
|
|
9
|
+
Requires-Dist: beautifulsoup4
|
|
10
|
+
Requires-Dist: curl_cffi
|
|
11
|
+
Requires-Dist: pydantic>=2.0
|
|
12
|
+
Provides-Extra: test
|
|
13
|
+
Requires-Dist: pytest; extra == "test"
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
pyfanedit/__init__.py,sha256=Sqwkh5LofhWhxJAT9_dWIYvaP5KoqdGqmGtE2oeLgso,455
|
|
2
|
+
pyfanedit/client.py,sha256=oFdXBz3lR-2XV68teHXcwfqphqfD26VPlr3fUPvJNYU,11162
|
|
3
|
+
pyfanedit/models.py,sha256=EbTmHAEkXa1vr284lTTeiKjo4O97rATRf6lq3KgcT6A,4902
|
|
4
|
+
pyfanedit/parsers.py,sha256=gEZd_LZOXRHU8y1Hw0i9hXiw0nb8vYY5joJh36CwPAc,22791
|
|
5
|
+
pyfanedit/session.py,sha256=1dQQ3qNQFzeif3gvNau6lXAPeEGIdGWpcqLwMqti1i4,2891
|
|
6
|
+
pyfanedit/version.py,sha256=SiSP9OTFnysxlBkTvA4uRCQo7NNiTz_VVAXwdU1qfzg,229
|
|
7
|
+
pyfanedit-0.1.1a1.dist-info/METADATA,sha256=F5jpYUD9u1djkHlE0VOEHOHDoxsNhdxQQEdzSswZimk,399
|
|
8
|
+
pyfanedit-0.1.1a1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
9
|
+
pyfanedit-0.1.1a1.dist-info/top_level.txt,sha256=h2lYuDjGN-I_aix96ORMaTbtuWo9Yvsx__E4dBNguyk,10
|
|
10
|
+
pyfanedit-0.1.1a1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pyfanedit
|