pyfanedit 0.1.1a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyfanedit
3
+ Version: 0.1.1a1
4
+ Summary: Scraping client for fanedit.org IFDB
5
+ Author-email: JarbasAi <jarbasai@mailfence.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/OpenJarbas/pyfanedit
8
+ Requires-Python: >=3.8
9
+ Requires-Dist: beautifulsoup4
10
+ Requires-Dist: curl_cffi
11
+ Requires-Dist: pydantic>=2.0
12
+ Provides-Extra: test
13
+ Requires-Dist: pytest; extra == "test"
@@ -0,0 +1,43 @@
1
+ # pyfanedit
2
+
3
+ Python scraping client for [fanedit.org](https://fanedit.org) (IFDB — the Internet Fanedit Database).
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install pyfanedit
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ from pyfanedit import FaneditClient
15
+
16
+ client = FaneditClient()
17
+ results, _ = client.search("star wars")
18
+ detail = client.get_detail(results[0].url)
19
+ print(detail.title, detail.imdb_id, detail.time_cut)
20
+ ```
21
+
22
+ ## Features
23
+
24
+ - Search the IFDB by keyword, scope, and sort order
25
+ - Browse named categories (`fanfix`, `fanmix`, `extended`, `tv_to_movie`, and more)
26
+ - Browse by franchise, editor name, release year, or any other tag
27
+ - Curated lists: latest, top trusted-reviewer rated, top user rated, most popular, award winners
28
+ - Full detail pages: genre, cuts, intention, IMDB ID, editor and user reviews
29
+ - **Reviewer leaderboard** — paginated list of top reviewers with helpful-vote stats
30
+ - **Reviews by user** — all reviews written by a specific user, with eight sort orders
31
+ - **News** — front-page article cards and full article bodies with linked IFDB URLs
32
+ - In-process LRU cache with configurable TTL; thread-safe
33
+
34
+ ## Documentation
35
+
36
+ - [Quick Start](docs/quickstart.md)
37
+ - [API Reference](docs/reference.md)
38
+ - [IDs, IMDB Mapping, and Metadata](docs/ids-and-metadata.md)
39
+ - [Advanced Usage](docs/advanced.md)
40
+
41
+ ## License
42
+
43
+ Apache 2.0
@@ -0,0 +1,20 @@
1
+ from pyfanedit.client import CATEGORIES, FaneditClient
2
+ from pyfanedit.models import (
3
+ FaneditDetail, FaneditSummary,
4
+ NewsArticle, Review, ReviewRatings,
5
+ ReviewerEntry, UserReviewEntry,
6
+ )
7
+ from pyfanedit.version import __version__
8
+
9
+ __all__ = [
10
+ "__version__",
11
+ "FaneditClient",
12
+ "FaneditSummary",
13
+ "FaneditDetail",
14
+ "ReviewerEntry",
15
+ "UserReviewEntry",
16
+ "NewsArticle",
17
+ "Review",
18
+ "ReviewRatings",
19
+ "CATEGORIES",
20
+ ]
@@ -0,0 +1,294 @@
1
+ """High-level fanedit.org client."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Iterator, List, Optional
5
+
6
+ from pyfanedit.models import (
7
+ FaneditDetail, FaneditSummary,
8
+ NewsArticle, ReviewerEntry, UserReviewEntry,
9
+ )
10
+ from pyfanedit.parsers import (
11
+ parse_detail_page, parse_listing_page,
12
+ parse_news_article, parse_news_listing,
13
+ parse_reviewer_rank_page, parse_user_reviews_page,
14
+ )
15
+ from pyfanedit.session import Session
16
+
17
+ # Category slug → URL path
18
CATEGORIES = dict(
    # Sub-categories of the main fanedit listings.
    fanfix="category/fanedit-listings/fanfix/",
    fanmix="category/fanedit-listings/fanmix/",
    extended="category/fanedit-listings/extended-edition/",
    tv_to_movie="category/fanedit-listings/tv-to-movie/",
    movie_to_tv="category/fanedit-listings/movie-to-tv/",
    shorts="category/fanedit-listings/shorts/",
    special="category/fanedit-listings/custom-special-edition/",
    documentary="category/fanedit-listings/documentary-review/",
    # Categories that live outside the fanedit-listings tree.
    preservation="category/preservation-listings/",
    unapproved="category/unapproved-fanedits/",
)

# Values accepted for the ``order`` argument of the listing search.
ORDER_CHOICES = (
    "rdate",
    "date",
    "modified",
    "alpha",
    "rratio",
    "rvote",
)
32
+
33
+
34
class FaneditClient:
    """Scraping client for fanedit.org / IFDB.

    Page-level ``get_*`` methods fetch one page through the underlying
    ``Session`` and hand the HTML to the matching parser. Listing methods
    return ``(items, next_page_url)``. The ``iter_*`` variants keep requesting
    consecutive pages until the parser reports no next page or ``max_pages``
    pages have been fetched (``0`` means no limit).
    """

    REVIEW_ORDER_CHOICES = (
        "rdate",      # most recent
        "date",       # oldest
        "rating",     # most positive
        "rrating",    # most critical
        "updated",    # last updated
        "helpful",    # most helpful
        "rhelpful",   # least helpful
        "discussed",  # most discussed
    )

    def __init__(self, impersonate: str = "chrome120", cache_ttl: float = 300.0) -> None:
        """Create the client; both arguments are forwarded to ``Session``."""
        self._s = Session(impersonate=impersonate, cache_ttl=cache_ttl)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _page_params(page: int, base: Optional[dict] = None) -> Optional[dict]:
        """Build query parameters, adding ``pg`` only for pages after the first.

        Returns ``None`` when there is nothing to send, so first-page requests
        without extra parameters are issued with no query string.
        """
        params = dict(base) if base else {}
        if page > 1:
            params["pg"] = page
        return params or None

    @staticmethod
    def _iter_pages(fetch_page, max_pages: int = 0) -> Iterator:
        """Yield items from consecutive pages.

        ``fetch_page`` is called with a 1-based page number and must return
        ``(items, next_page_url)``. Iteration stops when no next page is
        reported or, if ``max_pages`` is non-zero, after that many pages.
        """
        page = 1
        while True:
            items, next_url = fetch_page(page)
            yield from items
            if not next_url or (max_pages and page >= max_pages):
                break
            page += 1

    def _get_listing(
        self, path: str, page: int, base: Optional[dict] = None
    ) -> tuple[List[FaneditSummary], Optional[str]]:
        """Fetch ``path`` for ``page`` and parse it as a fanedit listing."""
        html = self._s.get(path, params=self._page_params(page, base))
        return parse_listing_page(html)

    # ------------------------------------------------------------------
    # Category browsing
    # ------------------------------------------------------------------

    def get_category(
        self,
        category: str,
        page: int = 1,
    ) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of a category.

        ``category`` may be a key from ``CATEGORIES`` or a full URL path.
        Returns (items, next_page_url).
        """
        path = CATEGORIES.get(category, category)
        return self._get_listing(path, page)

    def iter_category(self, category: str, max_pages: int = 0) -> Iterator[FaneditSummary]:
        """Yield all fanedits in a category across pages."""
        yield from self._iter_pages(
            lambda page: self.get_category(category, page=page), max_pages
        )

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def search(
        self,
        keywords: str,
        scope: str = "title",
        query_type: str = "all",
        order: str = "rdate",
        page: int = 1,
    ) -> tuple[List[FaneditSummary], Optional[str]]:
        """Search the IFDB.

        Args:
            keywords: search terms
            scope: "title" | "reviews"
            query_type: "all" | "any" | "exact"
            order: one of ORDER_CHOICES
            page: 1-based page number

        Returns:
            (items, next_page_url)
        """
        base = {
            "query": query_type,
            "scope": scope,
            "keywords": keywords,
            "order": order,
        }
        return self._get_listing("fanedit-search/search-results/", page, base)

    def iter_search(
        self,
        keywords: str,
        scope: str = "title",
        query_type: str = "all",
        order: str = "rdate",
        max_pages: int = 0,
    ) -> Iterator[FaneditSummary]:
        """Yield all search results across pages."""
        yield from self._iter_pages(
            lambda page: self.search(
                keywords, scope=scope, query_type=query_type, order=order, page=page
            ),
            max_pages,
        )

    # ------------------------------------------------------------------
    # Tag / franchise browsing
    # ------------------------------------------------------------------

    def get_by_tag(
        self,
        tag_type: str,
        tag_value: str,
        page: int = 1,
    ) -> tuple[List[FaneditSummary], Optional[str]]:
        """Browse by a tag, e.g. get_by_tag("franchise", "star-wars").

        Common tag_type values: franchise, faneditorname, originalmovietitle,
        fanedittype, faneditreleasedate, award.
        """
        path = f"fanedit-search/tag/{tag_type}/{tag_value}/"
        return self._get_listing(path, page, {"criteria": "2"})

    def iter_by_tag(
        self, tag_type: str, tag_value: str, max_pages: int = 0
    ) -> Iterator[FaneditSummary]:
        """Yield all fanedits carrying a tag across pages."""
        yield from self._iter_pages(
            lambda page: self.get_by_tag(tag_type, tag_value, page=page), max_pages
        )

    # ------------------------------------------------------------------
    # Curated lists
    # ------------------------------------------------------------------

    def get_latest(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of the ``latest-ifdb-fanedits/`` listing."""
        return self._get_listing("latest-ifdb-fanedits/", page)

    def get_top_trusted_rated(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of the ``top-trusted-reviewer-rated-fanedits/`` listing."""
        return self._get_listing("top-trusted-reviewer-rated-fanedits/", page)

    def get_top_user_rated(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of the ``top-user-rated-fanedits/`` listing."""
        return self._get_listing("top-user-rated-fanedits/", page)

    def get_most_popular(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of the ``most-popular/`` listing."""
        return self._get_listing("most-popular/", page)

    def get_award_winners(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return one page of fanedits tagged ``award`` / ``fanedit-of-the-month``."""
        return self.get_by_tag("award", "fanedit-of-the-month", page=page)

    # ------------------------------------------------------------------
    # Detail
    # ------------------------------------------------------------------

    def get_detail(self, url: str) -> FaneditDetail:
        """Fetch and parse a single fanedit detail page.

        ``url`` may be a full URL or a slug like ``/star-wars-despecialized/``.
        """
        # Test for a real scheme so a slug that merely begins with "http"
        # is still treated as a slug.
        if not url.startswith(("http://", "https://")):
            url = "https://fanedit.org/" + url.strip("/") + "/"
        html = self._s.get(url)
        return parse_detail_page(html, url)

    # ------------------------------------------------------------------
    # Reviewer leaderboard
    # ------------------------------------------------------------------

    def get_reviewer_rank(self, page: int = 1) -> tuple[List[ReviewerEntry], Optional[str]]:
        """Return one page of the reviewer leaderboard (50 per page, ~3 400 total).

        Each entry includes numeric user_id, which can be passed directly to
        ``get_user_reviews()`` without needing to know the username.
        """
        html = self._s.get("reviewer-rank/", params=self._page_params(page))
        return parse_reviewer_rank_page(html)

    def iter_reviewer_rank(self, max_pages: int = 0) -> Iterator[ReviewerEntry]:
        """Yield all reviewers from the leaderboard across pages."""
        yield from self._iter_pages(
            lambda page: self.get_reviewer_rank(page=page), max_pages
        )

    # ------------------------------------------------------------------
    # Reviews by user
    # ------------------------------------------------------------------

    def get_user_reviews(
        self,
        user_id: int,
        page: int = 1,
        order: str = "rdate",
    ) -> tuple[List[UserReviewEntry], Optional[str]]:
        """Return one page of reviews written by a specific user.

        Args:
            user_id: numeric jReviews user ID (from ReviewerEntry.user_id or
                     the /reviewer-rank/ page anchor id="user-N")
            page: 1-based page number
            order: one of REVIEW_ORDER_CHOICES
        """
        params = self._page_params(page, {"order": order})
        html = self._s.get(f"my-reviews/{user_id}/", params=params)
        return parse_user_reviews_page(html)

    def iter_user_reviews(
        self,
        user_id: int,
        order: str = "rdate",
        max_pages: int = 0,
    ) -> Iterator[UserReviewEntry]:
        """Yield all reviews written by a user across pages."""
        yield from self._iter_pages(
            lambda page: self.get_user_reviews(user_id, page=page, order=order),
            max_pages,
        )

    def get_latest_user_reviews(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return the latest user reviews feed (all users)."""
        return self._get_listing("latest-user-reviews/", page)

    def get_latest_trusted_reviews(self, page: int = 1) -> tuple[List[FaneditSummary], Optional[str]]:
        """Return the latest trusted-reviewer reviews feed."""
        return self._get_listing("latest-trusted-reviewer-reviews/", page)

    # ------------------------------------------------------------------
    # News
    # ------------------------------------------------------------------

    def get_news(self) -> List[NewsArticle]:
        """Return the news front page article cards (up to ~15 articles)."""
        html = self._s.get("forums/news-publisher/")
        return parse_news_listing(html)

    def get_news_article(self, url: str) -> NewsArticle:
        """Fetch a full news article, including body text and mentioned IFDB URLs.

        ``url`` may be a full URL or a forums-relative path like
        ``/forums/news-publisher/some-article.185/``; the leading slash is
        optional.
        """
        if not url.startswith(("http://", "https://")):
            # Normalise the leading slash so "forums/..." and "/forums/..."
            # both resolve under the site root instead of being glued onto
            # the host name.
            url = "https://fanedit.org/" + url.lstrip("/")
        html = self._s.get(url)
        return parse_news_article(html, url)
@@ -0,0 +1,140 @@
1
+ from typing import Dict, List, Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class ReviewRatings(BaseModel):
    """Per-category scores attached to one review; every score is optional."""

    overall: Optional[float] = None
    # Technical quality
    audio_video_quality: Optional[float] = None
    audio_editing: Optional[float] = None
    visual_editing: Optional[float] = None
    # Content
    narrative: Optional[float] = None
    enjoyment: Optional[float] = None
13
+
14
+
15
class Review(BaseModel):
    """A single review of a fanedit; all fields are optional."""

    # Who wrote it
    reviewer: Optional[str] = None
    reviewer_url: Optional[str] = None
    reviewer_rank: Optional[str] = None
    reviewer_review_count: Optional[int] = None

    # What was written
    date: Optional[str] = None
    ratings: ReviewRatings = Field(default_factory=ReviewRatings)
    body: Optional[str] = None

    # Follow-up discussion and helpfulness votes
    discussion_url: Optional[str] = None
    helpful_yes: Optional[int] = None
    helpful_no: Optional[int] = None
26
+
27
+
28
class FaneditSummary(BaseModel):
    """Lightweight fanedit record as it appears on list and search pages.

    Only ``title`` and ``url`` are required; everything else defaults to
    ``None``.
    """

    # Canonical IDs
    fanedit_id: Optional[int] = None  # WordPress post ID (stable numeric ID)
    slug: Optional[str] = None        # URL slug, e.g. "star-wars-begins"

    # Required identity
    title: str
    url: str

    # Descriptive fields
    cover_url: Optional[str] = None
    faneditor: Optional[str] = None
    original_title: Optional[str] = None
    fanedit_type: Optional[str] = None
    franchise: Optional[str] = None
    release_date: Optional[str] = None
    running_time: Optional[str] = None
    synopsis: Optional[str] = None

    # Ratings and popularity
    editor_rating: Optional[float] = None
    user_rating: Optional[float] = None
    user_rating_count: Optional[int] = None
    views: Optional[int] = None
    updated: Optional[str] = None
49
+
50
+
51
class FaneditDetail(BaseModel):
    """Full fanedit record as parsed from the fanedit's own page.

    Only ``title`` and ``url`` are required. Labelled page fields that have
    no dedicated attribute are kept in ``extra_fields``.
    """

    # Canonical IDs
    fanedit_id: Optional[int] = None  # WordPress post ID (stable numeric ID)
    slug: Optional[str] = None        # URL slug

    # Identity and classification
    title: str
    url: str
    cover_url: Optional[str] = None
    faneditor: Optional[str] = None
    original_title: Optional[str] = None
    genre: Optional[List[str]] = None
    franchise: Optional[List[str]] = None
    fanedit_type: Optional[str] = None

    # Source film metadata
    original_release_date: Optional[str] = None
    original_running_time: Optional[str] = None
    imdb_id: Optional[str] = None  # e.g. "tt0076759", taken from an embedded IMDB link

    # Edit metadata
    fanedit_release_date: Optional[str] = None
    fanedit_running_time: Optional[str] = None
    time_cut: Optional[str] = None
    time_added: Optional[str] = None
    subtitles: Optional[str] = None
    available_in: Optional[str] = None         # HD / SD / Surround Sound etc.
    release_information: Optional[str] = None  # Digital / Physical etc.

    # Content
    synopsis: Optional[str] = None
    additional_notes: Optional[str] = None
    special_thanks: Optional[str] = None
    cuts_and_additions: Optional[str] = None
    intention: Optional[str] = None  # editor's stated intent
    awards: Optional[str] = None     # e.g. Fanedit of the Month

    # Ratings
    editor_rating: Optional[float] = None
    user_rating: Optional[float] = None
    user_rating_count: Optional[int] = None

    # Reviews
    editor_reviews: List[Review] = Field(default_factory=list)
    user_reviews: List[Review] = Field(default_factory=list)

    # Raw overflow for any unmapped fields (label -> text)
    extra_fields: Dict[str, str] = Field(default_factory=dict)
99
+
100
+
101
class ReviewerEntry(BaseModel):
    """A single row of the reviewer leaderboard."""

    # Position and identity
    rank: int
    user_id: int      # numeric jReviews user ID (stable)
    username: str

    # Links
    profile_url: str  # /members/{username}/
    reviews_url: str  # /my-reviews/{user_id}/

    # Statistics
    review_count: int
    helpful_yes: Optional[int] = None
    helpful_pct: Optional[float] = None  # e.g. 76.61
111
+
112
+
113
class UserReviewEntry(BaseModel):
    """A single review taken from a user's review list page."""

    # The fanedit being reviewed
    fanedit_title: str
    fanedit_url: str
    fanedit_type: Optional[str] = None

    # The review itself
    date: Optional[str] = None
    ratings: ReviewRatings = Field(default_factory=ReviewRatings)

    # Discussion thread
    discussion_url: Optional[str] = None
    comment_count: Optional[int] = None
122
+
123
+
124
class NewsArticle(BaseModel):
    """A news article, from either the front-page listing or its own page."""

    # Available from the listing card
    thread_id: int  # XenForo thread ID
    title: str
    url: str        # full URL to the article
    thumbnail_url: Optional[str] = None
    author: Optional[str] = None
    author_user_id: Optional[int] = None
    published_at: Optional[str] = None  # ISO datetime string
    reading_time: Optional[str] = None

    # Available only when the article page is fetched individually
    views: Optional[int] = None
    category: Optional[str] = None
    body_html: Optional[str] = None
    body_text: Optional[str] = None
    mentioned_fanedit_urls: List[str] = Field(default_factory=list)
@@ -0,0 +1,690 @@
1
+ """HTML parsers for fanedit.org pages."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import List, Optional, Tuple
6
+
7
+ from bs4 import BeautifulSoup, Tag
8
+
9
+ from pyfanedit.models import (
10
+ FaneditDetail, FaneditSummary, NewsArticle,
11
+ Review, ReviewRatings, ReviewerEntry, UserReviewEntry,
12
+ )
13
+
14
+ # Map of raw label text → FaneditDetail field name
15
_DETAIL_FIELD_MAP = {
    # identity
    "faneditor name:": "faneditor",
    "original movie/show title:": "original_title",
    "fanedit type:": "fanedit_type",
    # dates and durations
    "original release date:": "original_release_date",
    "original running time:": "original_running_time",
    "fanedit release date:": "fanedit_release_date",
    "fanedit running time:": "fanedit_running_time",
    "time cut:": "time_cut",
    "time added:": "time_added",
    # availability
    "subtitles available:": "subtitles",
    "available in:": "available_in",
    # free-text sections
    "synopsis:": "synopsis",
    "additional notes:": "additional_notes",
    "special thanks:": "special_thanks",
    "release information:": "release_information",
    "cuts and additions:": "cuts_and_additions",
    "intention:": "intention",
    "awards:": "awards",
}

# Lower-cased label text -> FaneditSummary field name (listing pages).
_SUMMARY_FIELD_MAP = {
    "faneditor name:": "faneditor",
    "original movie/show title:": "original_title",
    "fanedit type:": "fanedit_type",
    "fanedit release date:": "release_date",
    "fanedit running time:": "running_time",
    "synopsis:": "synopsis",
    "franchise:": "franchise",
}

# Lower-cased rating label -> ReviewRatings field name.
_REVIEW_RATING_MAP = {
    "overall rating": "overall",
    "audio/video quality": "audio_video_quality",
    "audio editing": "audio_editing",
    "visual editing": "visual_editing",
    "narrative": "narrative",
    "enjoyment": "enjoyment",
}
54
+
55
+
56
def _parse_overall_rating(el: Optional[Tag]) -> Tuple[Optional[float], Optional[int]]:
    """Extract ``(rating, vote_count)`` from an overall-rating element.

    The text of the nested ``jrRatingValue`` element is expected to begin
    with the score and may carry the vote count in parentheses, e.g.
    ``"4.5 (12)"``.

    Args:
        el: the rating container, or ``None`` when the page has none.

    Returns:
        ``(rating, count)``; either part is ``None`` when it is absent or
        not a well-formed number.
    """
    if el is None:
        return None, None
    value_el = el.find(class_="jrRatingValue")
    if value_el is None:
        return None, None

    text = value_el.get_text(" ", strip=True)

    count_m = re.search(r"\((\d+)\)", text)
    count = int(count_m.group(1)) if count_m else None

    # Match one well-formed decimal only. A looser "[\d.]+" would also accept
    # strings such as "." or "1.2.3", which float() rejects with ValueError.
    rating_m = re.match(r"\d+(?:\.\d+)?|\.\d+", text)
    rating = float(rating_m.group()) if rating_m else None
    return rating, count
70
+
71
+
72
def _cover_url(soup: BeautifulSoup) -> Optional[str]:
    """Return the cover image URL from the first ``jrMediaPhoto`` element.

    The ``data-jr-src`` attribute is preferred over ``src``. Inline ``data:``
    URIs and empty values yield ``None``.
    """
    photo = soup.find(class_="jrMediaPhoto")
    if photo is None:
        return None
    source = photo.get("data-jr-src") or photo.get("src", "")
    if source.startswith("data:"):
        return None
    return source or None
78
+
79
+
80
def _imdb_id(soup: BeautifulSoup) -> Optional[str]:
    """Return the first ``tt<digits>`` identifier found in any link href.

    Anchors are scanned in document order; ``None`` is returned when no href
    contains a ``/tt<digits>`` segment.
    """
    hrefs = (anchor["href"] for anchor in soup.find_all("a", href=True))
    matches = (re.search(r"/(tt\d+)", href) for href in hrefs)
    return next((found.group(1) for found in matches if found), None)
86
+
87
+
88
def _wp_post_id(soup: BeautifulSoup) -> Optional[int]:
    """Return the numeric ID from a ``postid-<n>`` class on ``<body>``.

    Returns ``None`` when there is no body element or no such class.
    """
    body = soup.find("body")
    if not body:
        return None
    for css_class in body.get("class", []):
        found = re.match(r"postid-(\d+)", css_class)
        if found:
            return int(found.group(1))
    return None
96
+
97
+
98
def _slug_from_url(url: str) -> str:
    """Return the last path segment of ``url``, ignoring trailing slashes."""
    trimmed = url.rstrip("/")
    _, _, tail = trimmed.rpartition("/")
    return tail
100
+
101
+
102
def _parse_review_ratings(rating_table: Tag) -> ReviewRatings:
    """Build a ``ReviewRatings`` from the rows of a rating table.

    Each ``fwd-table-row`` is expected to hold a label cell and a value cell.
    Rows with fewer than two cells, labels missing from
    ``_REVIEW_RATING_MAP`` and values that are not numeric are ignored.
    """
    scores: dict = {}
    for row in rating_table.find_all(class_="fwd-table-row"):
        cells = row.find_all(class_=["jrRatingLabel", "jrRatingValue"])
        if len(cells) >= 2:
            # The first matching cell is the label, the last one the value.
            label_text = cells[0].get_text(strip=True).lower()
            value_text = cells[-1].get_text(strip=True)
            target = _REVIEW_RATING_MAP.get(label_text)
            if not target:
                continue
            try:
                scores[target] = float(value_text)
            except ValueError:
                pass
    return ReviewRatings(**scores)
117
+
118
+
119
def _parse_review(review_el: Tag) -> Review:
    """Convert one review element into a ``Review``.

    Every piece is optional: a sub-element that is missing from the markup
    leaves the corresponding field at ``None`` (or an empty
    ``ReviewRatings``).
    """

    def _vote_count(button: Optional[Tag]) -> Optional[int]:
        # Counter text nested in a vote button; None if absent or non-numeric.
        if not button:
            return None
        try:
            return int(button.find(class_="count-text").get_text(strip=True))
        except (AttributeError, ValueError):
            return None

    fields: dict = dict(
        reviewer=None,
        reviewer_url=None,
        reviewer_rank=None,
        reviewer_review_count=None,
        date=None,
        ratings=None,
        body=None,
        discussion_url=None,
        helpful_yes=None,
        helpful_no=None,
    )

    # Reviewer identity lives in the left-hand panel.
    panel = review_el.find(class_="jrReviewLayoutLeft")
    if panel:
        name_el = panel.find(itemprop="name")
        if name_el:
            fields["reviewer"] = name_el.get_text(strip=True)
        link_el = panel.find(itemprop="url")
        if link_el:
            fields["reviewer_url"] = link_el.get("href")
        rank_el = panel.find(class_="jrReviewerRank")
        if rank_el:
            fields["reviewer_rank"] = rank_el.get_text(strip=True)
        total_el = panel.find(class_="jrReviewerReviews")
        if total_el:
            digits = re.search(r"(\d+)", total_el.get_text())
            if digits:
                fields["reviewer_review_count"] = int(digits.group(1))

    # Creation date: prefer the machine-readable attribute over the text.
    created = review_el.find("time", class_="jrReviewCreated")
    if created:
        fields["date"] = created.get("datetime") or created.get_text(strip=True)

    # Per-category ratings.
    table = review_el.find(class_="jrRatingTable")
    fields["ratings"] = _parse_review_ratings(table) if table else ReviewRatings()

    # Review body text.
    comment = review_el.find(class_="jrReviewComment")
    if comment:
        fields["body"] = comment.get_text(" ", strip=True)

    # Discussion link and helpful votes come from the actions footer.
    actions = review_el.find(class_="jrReviewActions")
    if actions:
        discuss = actions.find(class_="jrDiscussReview")
        if discuss:
            fields["discussion_url"] = discuss.get("href")
        votes = actions.find(class_="jr-review-vote")
        if votes:
            fields["helpful_yes"] = _vote_count(votes.find(class_="jrVoteYes"))
            fields["helpful_no"] = _vote_count(votes.find(class_="jrVoteNo"))

    return Review(**fields)
191
+
192
+
193
def _parse_reviews(soup: BeautifulSoup, container_class: str) -> List[Review]:
    """Parse every real review inside the first ``container_class`` element.

    Returns an empty list when the container is missing. Layout elements
    without a ``jrReviewLayoutLeft`` panel are summary placeholders with no
    actual reviewer, so they are skipped.
    """
    container = soup.find(class_=container_class)
    if container is None:
        return []
    return [
        _parse_review(layout)
        for layout in container.find_all(class_="jrReviewLayout")
        if layout.find(class_="jrReviewLayoutLeft") is not None
    ]
204
+
205
+
206
def _parse_outer(outer: Tag) -> Optional[FaneditSummary]:
    """Parse one listing card/row into a ``FaneditSummary``.

    Args:
        outer: the element wrapping a single listing entry.

    Returns:
        The parsed summary, or ``None`` when the element has no title link
        with an href (for example a header or spacer row).
    """
    title_el = outer.find(class_="jrListingTitle")
    link = title_el.find("a") if title_el is not None else None
    if link is None:
        return None
    # An anchor without an href cannot produce the required ``url`` field;
    # skip the entry instead of raising KeyError and losing the whole page.
    url = link.get("href")
    if not url:
        return None

    title = link.get_text(strip=True)
    slug = _slug_from_url(url)

    # cover: prefer the lazy-load attribute; ignore inline data: placeholders
    # and empty values so the field is either a usable URL or None.
    cover_url: Optional[str] = None
    thumb = outer.find(class_="jrListingThumbnail")
    img = thumb.find("img") if thumb else None
    if img:
        src = img.get("data-jr-src") or img.get("src", "")
        if src and not src.startswith("data:"):
            cover_url = src

    # ratings
    editor_r, _ = _parse_overall_rating(outer.find(class_="jrOverallEditor"))
    user_r, user_cnt = _parse_overall_rating(outer.find(class_="jrOverallUser"))

    # views: the counter is the text of a span holding the graph icon.
    # Thousands separators are tolerated; any other non-numeric text is
    # ignored. When several spans qualify, the last numeric one wins.
    views: Optional[int] = None
    for span in outer.find_all("span"):
        if span.find(class_="jrIconGraph"):
            try:
                views = int(span.get_text(strip=True).replace(",", ""))
            except ValueError:
                pass

    # date
    updated: Optional[str] = None
    date_el = outer.find(class_="jrDateValue")
    if date_el:
        updated = date_el.get_text(strip=True)

    # custom fields, keyed by their lower-cased label text
    kwargs: dict = {}
    for row in outer.find_all(class_="jrFieldRow"):
        lbl = row.find(class_="jrFieldLabel")
        val = row.find(class_="jrFieldValue")
        if lbl and val:
            mapped = _SUMMARY_FIELD_MAP.get(lbl.get_text(strip=True).lower())
            if mapped:
                kwargs[mapped] = val.get_text(" ", strip=True)

    # fanedit type from jrListingCategory (search results layout), used only
    # when no explicit "fanedit type:" field row supplied one.
    if "fanedit_type" not in kwargs:
        cat_el = outer.find(class_="jrListingCategory")
        fanedit_type = cat_el.get_text(strip=True) if cat_el else None
        if fanedit_type:
            kwargs["fanedit_type"] = fanedit_type

    return FaneditSummary(
        slug=slug,
        title=title,
        url=url,
        cover_url=cover_url,
        editor_rating=editor_r,
        user_rating=user_r,
        user_rating_count=user_cnt,
        views=views,
        updated=updated,
        **kwargs,
    )
279
+
280
+
281
def parse_listing_page(html: str) -> Tuple[List[FaneditSummary], Optional[str]]:
    """Parse a category/search listing page.

    Returns:
        ``(items, next_page_url)``, where ``next_page_url`` is the href of
        the link following the current-page marker in the pagination bar, or
        ``None`` when there is none.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Category pages use "jr-layout-outer"; search/tag pages use "jrRow"
    # (minus the table header row).
    entries = soup.find_all(class_="jr-layout-outer")
    if not entries:
        entries = [
            row
            for row in soup.find_all(class_="jrRow")
            if "jrDataListHeader" not in (row.get("class") or [])
        ]

    parsed = (_parse_outer(entry) for entry in entries)
    items: List[FaneditSummary] = [summary for summary in parsed if summary]

    # next page: the anchor that follows the current-page marker
    pager = soup.find(class_="jrPagination")
    current = pager.find(class_="jrPageCurrent") if pager else None
    following = current.find_next_sibling("a") if current else None
    next_url: Optional[str] = None
    if following and following.get("href"):
        next_url = following["href"]

    return items, next_url
308
+
309
+
310
def parse_detail_page(html: str, url: str) -> FaneditDetail:
    """Parse a single fanedit detail page into a ``FaneditDetail``.

    Args:
        html: the page markup.
        url: the page URL; stored on the result and used to derive the slug
            (and the title, when the page has no title element).
    """
    soup = BeautifulSoup(html, "html.parser")

    # title, falling back to the slug when no heading is present
    heading = soup.find(class_="jrListingTitle") or soup.find("h1", class_="contentheading")
    slug = _slug_from_url(url)
    title = heading.get_text(strip=True) if heading else _slug_from_url(url)

    # ratings
    editor_rating, _ = _parse_overall_rating(soup.find(class_="jrOverallEditor"))
    user_rating, user_rating_count = _parse_overall_rating(soup.find(class_="jrOverallUser"))

    # Collect labelled field rows; the first occurrence of a label wins.
    first_values: dict = {}
    for row in soup.find_all(class_="jrFieldRow"):
        label_el = row.find(class_="jrFieldLabel")
        value_el = row.find(class_="jrFieldValue")
        if label_el and value_el:
            first_values.setdefault(label_el.get_text(strip=True).lower(), value_el)

    # Route each label to a model field, a list field, or the overflow dict.
    mapped: dict = {}
    overflow: dict = {}
    for label, value_el in first_values.items():
        text = value_el.get_text(" ", strip=True)
        target = _DETAIL_FIELD_MAP.get(label)
        if target:
            mapped[target] = text
        elif label in ("genre:", "franchise:"):
            # List fields: one entry per link, or the raw text if unlinked.
            link_texts = [a.get_text(strip=True) for a in value_el.find_all("a")]
            mapped[label.rstrip(":")] = link_texts or [text]
        else:
            overflow[label] = text

    return FaneditDetail(
        fanedit_id=_wp_post_id(soup),
        slug=slug,
        title=title,
        url=url,
        cover_url=_cover_url(soup),
        imdb_id=_imdb_id(soup),
        editor_rating=editor_rating,
        user_rating=user_rating,
        user_rating_count=user_rating_count,
        editor_reviews=_parse_reviews(soup, "jrEditorReviewsContainer"),
        user_reviews=_parse_reviews(soup, "jrUserReviewsContainer"),
        extra_fields=overflow,
        **mapped,
    )
369
+
370
+
371
+ # ---------------------------------------------------------------------------
372
+ # Reviewer leaderboard
373
+ # ---------------------------------------------------------------------------
374
+
375
def parse_reviewer_rank_page(html: str) -> Tuple[List[ReviewerEntry], Optional[str]]:
    """Parse one page of /reviewer-rank/.

    Rows are skipped when they are the table header, lack a rank cell whose
    id has the form ``user-<n>``, have a non-numeric rank, or have no author
    element.

    Returns:
        ``(entries, next_page_url)``; ``next_page_url`` is ``None`` when the
        pagination bar has no link after the current page.
    """
    soup = BeautifulSoup(html, "html.parser")
    entries: List[ReviewerEntry] = []

    for row in soup.find_all(class_="jrRow"):
        if "jrDataListHeader" in (row.get("class") or []):
            continue

        # The rank cell carries both the rank (text) and the user ID (id attr).
        rank_col = row.find(class_="jrCenterAlign")
        if rank_col is None:
            continue
        user_id_m = re.match(r"user-(\d+)", rank_col.get("id", ""))
        if not user_id_m:
            continue
        user_id = int(user_id_m.group(1))
        try:
            rank = int(rank_col.get_text(strip=True))
        except ValueError:
            continue

        # username + profile URL
        author_el = row.find(class_="jrReviewAuthor")
        if author_el is None:
            continue
        a = author_el.find("a")
        username = a.get_text(strip=True) if a else ""
        # .get() so an anchor without href yields "" instead of KeyError.
        profile_url = (a.get("href") or "") if a else ""

        # review count + helpful votes; the reviews URL falls back to the
        # canonical per-user path whenever the page does not link one.
        reviews_url = f"https://fanedit.org/my-reviews/{user_id}/"
        review_count = 0
        helpful_yes = None
        helpful_pct = None
        content = row.find(class_="jrRankContent")
        if content:
            rev_a = content.find("a")
            if rev_a:
                reviews_url = rev_a.get("href") or reviews_url
                m = re.search(r"(\d+)", rev_a.get_text())
                if m:
                    review_count = int(m.group(1))
            text = content.get_text(" ", strip=True)
            hm = re.search(r"Helpful votes:\s*(\d+)\s*\(([0-9.]+)%\)", text)
            if hm:
                helpful_yes = int(hm.group(1))
                try:
                    helpful_pct = float(hm.group(2))
                except ValueError:
                    # "[0-9.]+" can match malformed text such as "1.2.3";
                    # leave the percentage unset rather than fail the page.
                    helpful_pct = None

        entries.append(ReviewerEntry(
            rank=rank,
            user_id=user_id,
            username=username,
            profile_url=profile_url,
            reviews_url=reviews_url,
            review_count=review_count,
            helpful_yes=helpful_yes,
            helpful_pct=helpful_pct,
        ))

    # next page: the anchor that follows the current-page marker
    next_url: Optional[str] = None
    pagenav = soup.find(class_="jrPagination")
    current = pagenav.find(class_="jrPageCurrent") if pagenav else None
    nxt = current.find_next_sibling("a") if current else None
    if nxt and nxt.get("href"):
        next_url = nxt["href"]

    return entries, next_url
447
+
448
+
449
+ # ---------------------------------------------------------------------------
450
+ # Reviews by user
451
+ # ---------------------------------------------------------------------------
452
+
453
def parse_user_reviews_page(html: str) -> Tuple[List[UserReviewEntry], Optional[str]]:
    """Parse one page of /my-reviews/{user_id}/.

    Args:
        html: Raw HTML of a user's review listing page.

    Returns:
        ``(reviews, next_page_url)``. ``next_page_url`` is the href of the
        anchor that follows the current-page marker inside the pagination
        element, or ``None`` when no such anchor exists.

    A review element is skipped when it has no listing title, no anchor in
    the title, or an anchor without a usable href.
    """
    soup = BeautifulSoup(html, "html.parser")
    reviews: List[UserReviewEntry] = []

    for el in soup.find_all(class_="jrReviewListLayout"):
        # fanedit link
        listing_title = el.find(class_="jrListingTitle")
        if listing_title is None:
            continue
        a = listing_title.find("a")
        if a is None:
            continue
        # .get() instead of ["href"]: a link-less anchor is skipped like any
        # other malformed entry rather than raising KeyError for the page.
        fanedit_url = a.get("href")
        if not fanedit_url:
            continue
        if not fanedit_url.startswith("http"):
            fanedit_url = "https://fanedit.org" + fanedit_url
        fanedit_title = a.get_text(strip=True)

        fanedit_type: Optional[str] = None
        cat_el = el.find(class_="jrListingCategory")
        if cat_el:
            fanedit_type = cat_el.get_text(strip=True)

        # Prefer the machine-readable datetime attribute, else visible text.
        date: Optional[str] = None
        date_el = el.find(class_="jrReviewCreated")
        if date_el:
            date = date_el.get("datetime") or date_el.get_text(strip=True)

        rating_table = el.find(class_="jrRatingTable")
        ratings = _parse_review_ratings(rating_table) if rating_table else ReviewRatings()

        # The discussion button's label has the form "Comment(s) (N)".
        discussion_url: Optional[str] = None
        comment_count: Optional[int] = None
        for btn_a in el.find_all("a", class_="jrButton"):
            href = btn_a.get("href", "")
            if "/discussions/" in href:
                discussion_url = href
                m = re.search(r"Comments?\s*\((\d+)\)", btn_a.get_text())
                if m:
                    comment_count = int(m.group(1))

        reviews.append(UserReviewEntry(
            fanedit_title=fanedit_title,
            fanedit_url=fanedit_url,
            fanedit_type=fanedit_type,
            date=date,
            ratings=ratings,
            discussion_url=discussion_url,
            comment_count=comment_count,
        ))

    next_url: Optional[str] = None
    pagenav = soup.find(class_="jrPagination")
    if pagenav:
        current = pagenav.find(class_="jrPageCurrent")
        if current:
            nxt = current.find_next_sibling("a")
            if nxt and nxt.get("href"):
                next_url = nxt["href"]

    return reviews, next_url
514
+
515
+
516
+ # ---------------------------------------------------------------------------
517
+ # News
518
+ # ---------------------------------------------------------------------------
519
+
520
+ def _thread_id_from_card(card: Tag) -> Optional[int]:
521
+ for cls in card.get("class", []):
522
+ m = re.match(r"js-threadListItem-(\d+)", cls)
523
+ if m:
524
+ return int(m.group(1))
525
+ return None
526
+
527
+
528
def parse_news_listing(html: str) -> List[NewsArticle]:
    """Parse the news front page (/forums/news-publisher/).

    Args:
        html: Raw HTML of the news listing page.

    Returns:
        One ``NewsArticle`` per news card, in document order. Cards without a
        thread id, a title element, a title anchor, or a usable href on that
        anchor are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    articles: List[NewsArticle] = []

    for card in soup.find_all(class_="newsCard-grid-item"):
        thread_id = _thread_id_from_card(card)
        if thread_id is None:
            continue

        title_el = card.find(class_="newsCard-grid-title")
        if title_el is None:
            continue
        a = title_el.find("a")
        if a is None:
            continue
        # .get() instead of ["href"]: a link-less title is skipped like the
        # other malformed cards rather than raising KeyError for the page.
        href = a.get("href")
        if not href:
            continue
        title = a.get_text(strip=True)
        url = href if href.startswith("http") else "https://fanedit.org" + href

        thumbnail_url: Optional[str] = None
        img = card.find("img", class_="newsCard-grid-image-link")
        if img:
            thumbnail_url = img.get("src")

        # The author name is read from the avatar image's alt text; the user
        # id from the data-user-id attribute of the wrapping anchor.
        author: Optional[str] = None
        author_user_id: Optional[int] = None
        avatar_a = card.find("a", attrs={"data-user-id": True})
        if avatar_a:
            author_img = avatar_a.find("img")
            if author_img:
                author = author_img.get("alt")
            try:
                author_user_id = int(avatar_a["data-user-id"])
            except (TypeError, ValueError):
                # Non-numeric id: leave author_user_id as None.
                pass

        published_at: Optional[str] = None
        time_el = card.find("time")
        if time_el:
            published_at = time_el.get("datetime")

        reading_time: Optional[str] = None
        for li in card.find_all("li", class_="newsCard-date"):
            text = li.get_text(strip=True)
            if "min read" in text:
                # strip SVG title prefix ("Reading time2 min read" → "2 min read")
                m = re.search(r"(\d+\s*min read)", text)
                reading_time = m.group(1) if m else text

        articles.append(NewsArticle(
            thread_id=thread_id,
            title=title,
            url=url,
            thumbnail_url=thumbnail_url,
            author=author,
            author_user_id=author_user_id,
            published_at=published_at,
            reading_time=reading_time,
        ))

    return articles
590
+
591
+
592
def _is_ifdb_link(href: str) -> bool:
    """Return True when *href* points at fanedit.org or one of its subdomains."""
    from urllib.parse import urlparse

    try:
        parts = urlparse(href)
        if not parts.scheme and not parts.netloc:
            # Scheme-less "fanedit.org/..." style links: re-parse as a
            # network-path reference so the host is recognised.
            parts = urlparse("//" + href)
        host = (parts.hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): not a link we can use.
        return False
    return host == "fanedit.org" or host.endswith(".fanedit.org")


def parse_news_article(html: str, url: str) -> NewsArticle:
    """Parse a single news article page.

    Args:
        html: Raw HTML of the article page.
        url: URL the page was fetched from; stored on the result unchanged.

    Returns:
        A ``NewsArticle``. Fields that cannot be found stay ``None``; the
        title falls back to ``""`` and the thread id to ``0``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # thread ID: first run of digits in the article's data-lb-id attribute
    article_el = soup.find("article", class_="newsBody-main")
    thread_id = 0
    if article_el:
        lb_id = article_el.get("data-lb-id", "")
        m = re.search(r"(\d+)", lb_id)
        if m:
            thread_id = int(m.group(1))

    # title
    title_el = soup.find("h1", class_="p-title-value") or soup.find("h1")
    title = title_el.get_text(strip=True) if title_el else ""

    # thumbnail
    thumbnail_url: Optional[str] = None
    thumb_img = soup.find("img", class_="newsView-newsThumbnail-header")
    if thumb_img:
        thumbnail_url = thumb_img.get("src")

    # author + published
    author: Optional[str] = None
    author_user_id: Optional[int] = None
    published_at: Optional[str] = None
    reading_time: Optional[str] = None
    views: Optional[int] = None

    desc = soup.find(class_="p-description")
    if desc:
        author_a = desc.find("a", attrs={"data-user-id": True})
        if author_a:
            author = author_a.get_text(strip=True)
            try:
                author_user_id = int(author_a["data-user-id"])
            except (TypeError, ValueError):
                # Non-numeric id: leave author_user_id as None.
                pass
        time_el = desc.find("time")
        if time_el:
            published_at = time_el.get("datetime")
        for li in desc.find_all("li"):
            text = li.get_text(strip=True)
            if "min read" in text:
                m = re.search(r"(\d+\s*min read)", text)
                reading_time = m.group(1) if m else text

    # view count from pairs--justified ("Views 1,234")
    for pair in soup.find_all(class_="pairs--justified"):
        text = pair.get_text(" ", strip=True)
        m = re.search(r"Views\s+([\d,]+)", text)
        if m:
            try:
                views = int(m.group(1).replace(",", ""))
            except ValueError:
                # e.g. a lone "," matched; keep any previously found value.
                pass

    # category: text of the last link in the last breadcrumb container
    category: Optional[str] = None
    crumbs = soup.find_all(class_="p-breadcrumbs")
    if not crumbs:
        crumbs = soup.find_all(attrs={"itemprop": "breadcrumb"})
    if crumbs:
        links = crumbs[-1].find_all("a")
        if links:
            category = links[-1].get_text(strip=True)

    # body
    body_html: Optional[str] = None
    body_text: Optional[str] = None
    bb = soup.find(class_="bbWrapper")
    if bb:
        body_html = str(bb)
        body_text = bb.get_text(" ", strip=True)

    # mentioned IFDB fanedit URLs: links in the body whose host is
    # fanedit.org (forum links excluded), de-duplicated in document order.
    # Matching on the parsed host avoids false positives such as
    # "mailto:someone@fanedit.org" or a look-alike domain.
    mentioned: List[str] = []
    if bb:
        for a in bb.find_all("a", href=True):
            href = a["href"]
            if "/forums/" in href or href in mentioned:
                continue
            if _is_ifdb_link(href):
                mentioned.append(href)

    return NewsArticle(
        thread_id=thread_id,
        title=title,
        url=url,
        thumbnail_url=thumbnail_url,
        author=author,
        author_user_id=author_user_id,
        published_at=published_at,
        reading_time=reading_time,
        views=views,
        category=category,
        body_html=body_html,
        body_text=body_text,
        mentioned_fanedit_urls=mentioned,
    )
@@ -0,0 +1,84 @@
1
+ """HTTP layer using curl_cffi to bypass TLS fingerprinting."""
2
+ from __future__ import annotations
3
+
4
+ import time
5
+ from threading import Lock
6
+ from typing import Any, Dict, Mapping, Optional, Tuple
7
+
8
+ from curl_cffi import requests
9
+
10
# Base URL that relative request paths are resolved against.
SITE_URL = "https://fanedit.org"
# Cache key shape: (HTTP method, absolute URL, sorted (name, value) query pairs).
_CacheKey = Tuple[str, str, Tuple[Tuple[str, str], ...]]
12
+
13
+
14
class Session:
    """HTTP session for fanedit.org with a small in-memory GET cache.

    Requests go through a ``curl_cffi`` session created with the given
    ``impersonate`` profile. Successful GET response bodies are cached by
    (URL, query parameters); access to the cache dict is guarded by a lock.

    Args:
        impersonate: Browser profile name passed to ``curl_cffi``.
        cache_ttl: Seconds a cached body stays valid. A falsy value (``0``)
            disables expiry.
        cache_size: Maximum number of cached bodies. A falsy value (``0``)
            disables storing altogether.
    """

    # Headers sent with every request; ``post`` adds a Referer on top.
    _HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }

    def __init__(
        self,
        impersonate: str = "chrome120",
        cache_ttl: float = 300.0,
        cache_size: int = 512,
    ) -> None:
        self._session = requests.Session(impersonate=impersonate)
        self.cache_ttl = cache_ttl
        self.cache_size = cache_size
        # key -> (timestamp from time.time() when stored, response body)
        self._cache: Dict[_CacheKey, Tuple[float, str]] = {}
        self._lock = Lock()

    @staticmethod
    def _url(path: str) -> str:
        """Resolve *path* against ``SITE_URL`` unless it is already absolute."""
        return path if path.startswith("http") else SITE_URL + "/" + path.lstrip("/")

    def _key(self, url: str, params: Optional[Mapping[str, Any]]) -> "_CacheKey":
        """Build an order-independent cache key for a GET of *url* with *params*."""
        items = tuple(sorted((k, str(v)) for k, v in (params or {}).items()))
        return ("GET", url, items)

    def _cache_get(self, key: "_CacheKey") -> Optional[str]:
        """Return the cached body for *key*, or ``None`` if absent or expired.

        The lookup, the staleness check and the eviction happen under a single
        lock acquisition, so an entry refreshed by another thread can never be
        evicted on the basis of an older timestamp.
        """
        with self._lock:
            hit = self._cache.get(key)
            if hit is None:
                return None
            ts, text = hit
            if self.cache_ttl and (time.time() - ts) > self.cache_ttl:
                del self._cache[key]
                return None
            return text

    def _cache_put(self, key: "_CacheKey", text: str) -> None:
        """Store *text* under *key*, evicting the oldest entries when full.

        Refreshing a key that is already cached never evicts another entry.
        """
        if not self.cache_size:
            return
        with self._lock:
            if key not in self._cache:
                # Usually a single iteration; loops only if cache_size was
                # lowered after entries had been stored.
                while self._cache and len(self._cache) >= self.cache_size:
                    oldest = min(self._cache.items(), key=lambda kv: kv[1][0])[0]
                    del self._cache[oldest]
            self._cache[key] = (time.time(), text)

    def get(
        self,
        path: str,
        params: Optional[Mapping[str, Any]] = None,
        use_cache: bool = True,
    ) -> str:
        """GET *path* and return the response body as text.

        Args:
            path: Absolute URL, or a path relative to ``SITE_URL``.
            params: Optional query parameters.
            use_cache: When false, the cache is neither read nor written.

        Raises:
            Whatever ``raise_for_status()`` of the underlying response raises
            for an error status.
        """
        url = self._url(path)
        key = self._key(url, params)
        if use_cache:
            cached = self._cache_get(key)
            if cached is not None:
                return cached
        r = self._session.get(url, params=params, headers=dict(self._HEADERS))
        r.raise_for_status()
        if use_cache:
            self._cache_put(key, r.text)
        return r.text

    def post(self, path: str, data: Optional[Mapping[str, Any]] = None) -> str:
        """POST *data* to *path* and return the response body (never cached).

        Raises:
            Whatever ``raise_for_status()`` of the underlying response raises
            for an error status.
        """
        url = self._url(path)
        headers = dict(self._HEADERS)
        headers["Referer"] = SITE_URL + "/fanedit-search/"
        r = self._session.post(url, data=data, headers=headers)
        r.raise_for_status()
        return r.text
82
+
83
+
84
# Module-level Session instance, created at import time with default settings.
default_session = Session()
@@ -0,0 +1,8 @@
1
# START_VERSION_BLOCK
VERSION_MAJOR = 0
VERSION_MINOR = 1
VERSION_BUILD = 1
VERSION_ALPHA = 1
# END_VERSION_BLOCK

# "<major>.<minor>.<build>" plus an "a<N>" suffix when VERSION_ALPHA is non-zero
# (e.g. "0.1.1a1"); the suffix is omitted when VERSION_ALPHA is 0.
__version__ = f"{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_BUILD}" + (f"a{VERSION_ALPHA}" if VERSION_ALPHA else "")
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyfanedit
3
+ Version: 0.1.1a1
4
+ Summary: Scraping client for fanedit.org IFDB
5
+ Author-email: JarbasAi <jarbasai@mailfence.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/OpenJarbas/pyfanedit
8
+ Requires-Python: >=3.8
9
+ Requires-Dist: beautifulsoup4
10
+ Requires-Dist: curl_cffi
11
+ Requires-Dist: pydantic>=2.0
12
+ Provides-Extra: test
13
+ Requires-Dist: pytest; extra == "test"
@@ -0,0 +1,13 @@
1
+ README.md
2
+ pyproject.toml
3
+ pyfanedit/__init__.py
4
+ pyfanedit/client.py
5
+ pyfanedit/models.py
6
+ pyfanedit/parsers.py
7
+ pyfanedit/session.py
8
+ pyfanedit/version.py
9
+ pyfanedit.egg-info/PKG-INFO
10
+ pyfanedit.egg-info/SOURCES.txt
11
+ pyfanedit.egg-info/dependency_links.txt
12
+ pyfanedit.egg-info/requires.txt
13
+ pyfanedit.egg-info/top_level.txt
@@ -0,0 +1,6 @@
1
+ beautifulsoup4
2
+ curl_cffi
3
+ pydantic>=2.0
4
+
5
+ [test]
6
+ pytest
@@ -0,0 +1 @@
1
+ pyfanedit
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pyfanedit"
7
+ dynamic = ["version"]
8
+ description = "Scraping client for fanedit.org IFDB"
9
+ license = { text = "Apache-2.0" }
10
+ authors = [{ name = "JarbasAi", email = "jarbasai@mailfence.com" }]
11
+ requires-python = ">=3.8"
12
+ dependencies = ["beautifulsoup4", "curl_cffi", "pydantic>=2.0"]
13
+
14
+ [project.optional-dependencies]
15
+ test = ["pytest"]
16
+
17
+ [project.urls]
18
+ Homepage = "https://github.com/OpenJarbas/pyfanedit"
19
+
20
+ [tool.setuptools.dynamic]
21
+ version = { attr = "pyfanedit.version.__version__" }
22
+
23
+ [tool.setuptools.packages.find]
24
+ include = ["pyfanedit*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+