dvdcompare-scraper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: dvdcompare-scraper
3
+ Version: 0.1.0
4
+ Summary: Scrape disc extras metadata from dvdcompare.net
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: httpx>=0.27
7
+ Requires-Dist: beautifulsoup4>=4.12
8
+ Provides-Extra: dev
9
+ Requires-Dist: pytest>=8.0; extra == "dev"
10
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
@@ -0,0 +1,80 @@
1
+ # dvdcompare-scraper
2
+
3
+ Scrape disc extras metadata from [dvdcompare.net](https://www.dvdcompare.net).
4
+
5
+ ## Install
6
+
7
+ ```
8
+ pip install -e ".[dev]"
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ Search by title:
14
+
15
+ ```
16
+ dvdcompare "Oppenheimer"
17
+ ```
18
+
19
+ Look up by dvdcompare film ID:
20
+
21
+ ```
22
+ dvdcompare --id 66397
23
+ ```
24
+
25
+ Look up by URL:
26
+
27
+ ```
28
+ dvdcompare --url "https://www.dvdcompare.net/comparisons/film.php?fid=66397"
29
+ ```
30
+
31
+ ### Regional releases
32
+
33
+ Each dvdcompare page lists multiple regional releases (e.g. America, United Kingdom, Japan), each with its own disc contents and runtimes. By default, the CLI shows only the first release listed.
34
+
35
+ - `--release` selects a release by position (1-based) or by name keyword (case-insensitive substring match):
36
+ ```
37
+ dvdcompare --id 67210 --release 2
38
+ dvdcompare --id 67210 --release america
39
+ dvdcompare --id 67210 --release "united kingdom"
40
+ ```
41
+ If no release matches the keyword, the available release names are printed so you can retry.
42
+ - `--all-releases` shows every release:
43
+ ```
44
+ dvdcompare --id 67210 --all-releases
45
+ ```
46
+ - `--json` outputs the data structure (respects `--release` filtering):
47
+ ```
48
+ dvdcompare --id 67210 --json
49
+ dvdcompare --id 67210 --release america --json
50
+ ```
51
+
52
+ ### Filtering with external tools
53
+
54
+ For more complex filtering, pipe the JSON output through jq or PowerShell:
55
+
56
+ **jq:**
57
+ ```bash
58
+ dvdcompare --id 67210 --json | jq '.releases |= map(select(.name | test("america"; "i")))'
59
+ ```
60
+
61
+ **PowerShell:**
62
+ ```powershell
63
+ dvdcompare --id 67210 --json | ConvertFrom-Json | ForEach-Object {
64
+ $_.releases = $_.releases | Where-Object { $_.name -match "america" }
65
+ $_ | ConvertTo-Json -Depth 10
66
+ }
67
+ ```
68
+
69
+ ## Data model
70
+
71
+ - `FilmComparison`: top-level object with title, year, format, director, IMDB info, and a list of `Release` objects.
72
+ - `Release`: a regional release with name (e.g. "Blu-ray ALL America - BBC"), year, and a list of `Disc` objects.
73
+ - `Disc`: a single disc with number, format (e.g. "Blu-ray 4K"), and a list of `Feature` objects.
74
+ - `Feature`: a bonus feature with title, runtime, type, year, technical notes, play-all flag, and optional children (for grouped features like "Making Of" collections or episode groups).
75
+
76
+ ## Tests
77
+
78
+ ```
79
+ py -m pytest tests/ -v
80
+ ```
@@ -0,0 +1,28 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dvdcompare-scraper"
7
+ version = "0.1.0"
8
+ description = "Scrape disc extras metadata from dvdcompare.net"
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "httpx>=0.27",
12
+ "beautifulsoup4>=4.12",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ dev = [
17
+ "pytest>=8.0",
18
+ "pytest-asyncio>=0.23",
19
+ ]
20
+
21
+ [project.scripts]
22
+ dvdcompare = "dvdcompare.cli:main"
23
+
24
+ [tool.setuptools.packages.find]
25
+ where = ["src"]
26
+
27
+ [tool.pytest.ini_options]
28
+ asyncio_mode = "auto"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,138 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import asyncio
5
+ import json
6
+ import sys
7
+ from dataclasses import asdict
8
+
9
+ from .models import Release
10
+ from .parser import format_runtime
11
+ from .scraper import get_film, get_film_by_url, search
12
+
13
+
14
def select_releases(releases: list[Release], selector: str) -> list[Release]:
    """Filter releases by 1-based index or case-insensitive name substring.

    A selector that parses as an integer is treated as a 1-based position.
    Positions past the end are clamped to the last release. Positions below 1
    are rejected rather than being allowed to wrap around through Python's
    negative indexing (which would make ``"0"`` select the last release).

    Any other selector is matched as a case-insensitive substring of each
    release name, and every matching release is returned.

    Raises LookupError if ``releases`` is empty and a position was given, if
    the position is less than 1, or if no release name contains the keyword.
    The error message lists the available release names.
    """

    def _no_match() -> LookupError:
        # Build the error lazily; the listing is only needed on failure.
        names = "\n".join(f" {i}. {r.name}" for i, r in enumerate(releases, 1))
        return LookupError(
            f"No release matching '{selector}'.\nAvailable releases:\n{names}"
        )

    try:
        position = int(selector)
    except ValueError:
        position = None

    if position is not None:
        if position < 1 or not releases:
            raise _no_match()
        # Clamp positions beyond the end to the last release.
        return [releases[min(position, len(releases)) - 1]]

    keyword = selector.lower()
    matched = [r for r in releases if keyword in r.name.lower()]
    if matched:
        return matched

    raise _no_match()
35
+
36
+
37
def main() -> None:
    """Command-line entry point.

    Builds the argument parser, parses ``sys.argv`` and hands the resulting
    namespace to the async ``_run`` coroutine. Exactly one film source must be
    supplied: a positional search query, ``--id`` or ``--url``.
    """
    cli = argparse.ArgumentParser(
        description="Scrape disc extras metadata from dvdcompare.net",
    )

    # The three ways of identifying a film are mutually exclusive.
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument("query", nargs="?", help="Search query (film title)")
    source.add_argument(
        "--id", type=int, dest="film_id", help="dvdcompare film ID"
    )
    source.add_argument("--url", help="dvdcompare film page URL")

    # Output selection / formatting switches.
    cli.add_argument(
        "--release",
        default="1",
        help="Release number (1-based) or name keyword (default: 1)",
    )
    cli.add_argument(
        "--all-releases",
        action="store_true",
        help="Show all releases instead of just one",
    )
    cli.add_argument("--json", action="store_true", help="Output as JSON")

    namespace = cli.parse_args()
    asyncio.run(_run(namespace))
60
+
61
+
62
async def _run(args: argparse.Namespace) -> None:
    """Fetch the requested film, apply release filtering, and print it.

    The film is looked up by ``args.film_id``, then ``args.url``, and
    otherwise by searching for ``args.query`` and taking the first hit (the
    full hit list is echoed to stderr when there is more than one).

    Unless ``args.all_releases`` is set, the film's releases are narrowed with
    ``select_releases`` using ``args.release``. Output is JSON when
    ``args.json`` is set, otherwise the human-readable text layout.

    Exits with status 1 (message on stderr) when the search finds nothing or
    when no release matches the ``--release`` selector.
    """
    # Compare against None so that ``--id 0`` is still treated as an ID
    # lookup instead of silently falling through to a search for ``None``.
    if args.film_id is not None:
        film = await get_film(args.film_id)
    elif args.url:
        film = await get_film_by_url(args.url)
    else:
        results = await search(args.query)
        if not results:
            print("No results found.", file=sys.stderr)
            sys.exit(1)
        if len(results) > 1:
            print(f"Found {len(results)} results:", file=sys.stderr)
            for i, r in enumerate(results, 1):
                print(f" {i}. {r.title} (fid={r.film_id})", file=sys.stderr)
            print(
                "Using first result. Use --id to select a specific one.",
                file=sys.stderr,
            )
        film = await get_film(results[0].film_id)

    if not args.all_releases and film.releases:
        try:
            film.releases = select_releases(film.releases, args.release)
        except LookupError as exc:
            # The exception text already lists the available releases; show
            # it as a plain message rather than an uncaught traceback.
            print(exc, file=sys.stderr)
            sys.exit(1)

    if args.json:
        print(json.dumps(asdict(film), indent=2))
    else:
        _print_text(film, args)
89
+
90
+
91
+ def _print_text(film, args: argparse.Namespace) -> None:
92
+ header = film.title
93
+ if film.format:
94
+ header += f" ({film.format})"
95
+ if film.year:
96
+ header += f" ({film.year})"
97
+ print(header)
98
+ if film.director:
99
+ print(f"Director: {film.director}")
100
+ if film.imdb_id:
101
+ print(f"IMDB: {film.imdb_id}")
102
+ print()
103
+
104
+ for release in film.releases:
105
+ line = f"--- {release.name}"
106
+ if release.year:
107
+ line += f" [{release.year}]"
108
+ line += " ---"
109
+ print(line)
110
+
111
+ for disc in release.discs:
112
+ print(f"\n DISC {disc.number} ({disc.format})")
113
+ if disc.is_film:
114
+ print(" The Film")
115
+ for feature in disc.features:
116
+ _print_feature(feature, indent=4)
117
+ print()
118
+
119
+
120
+ def _print_feature(feature, indent: int = 4) -> None:
121
+ prefix = " " * indent
122
+ parts = [f'{prefix}"{feature.title}"']
123
+ if feature.year:
124
+ parts.append(str(feature.year))
125
+ if feature.feature_type:
126
+ parts.append(feature.feature_type)
127
+ if feature.runtime_seconds is not None:
128
+ rt = format_runtime(feature.runtime_seconds)
129
+ if feature.is_play_all:
130
+ parts.append(f"(Play All - {rt})")
131
+ else:
132
+ parts.append(f"({rt})")
133
+ if feature.technical_notes:
134
+ parts.append(f"[{feature.technical_notes}]")
135
+ print(" ".join(parts))
136
+
137
+ for child in feature.children:
138
+ _print_feature(child, indent=indent + 2)
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+
6
@dataclass
class Feature:
    """A single bonus feature on a disc.

    Features can nest: a group header carries its members in ``children``.
    """

    # Display title of the feature.
    title: str
    # Running time in whole seconds, or None when no runtime was found.
    runtime_seconds: int | None = None
    # Free-text type that follows the title (and year), if any.
    feature_type: str | None = None
    # Four-digit year that follows the title, if any.
    year: int | None = None
    # Text taken from a bracketed "[...]" note, if any.
    technical_notes: str | None = None
    # True when the runtime was given as a "Play All" total.
    is_play_all: bool = False
    # Sub-features grouped under this feature (empty for a plain feature).
    children: list[Feature] = field(default_factory=list)
17
+
18
+
19
@dataclass
class Disc:
    """A single disc in a release."""

    # Disc number within the release.
    number: int
    # Disc format text, e.g. "Blu-ray 4K".
    format: str
    # True when the disc is flagged as carrying the main film.
    is_film: bool = False
    # Top-level bonus features on this disc, in listing order.
    features: list[Feature] = field(default_factory=list)
27
+
28
+
29
@dataclass
class Release:
    """A regional release of a film."""

    # Release name, e.g. "Blu-ray ALL America - BBC".
    name: str
    # Release year, or None when not known.
    year: int | None = None
    # Discs that make up this release.
    discs: list[Disc] = field(default_factory=list)
36
+
37
+
38
@dataclass
class FilmComparison:
    """A complete film comparison page from dvdcompare.net."""

    # Film title.
    title: str
    # Film year, or None when not known.
    year: int | None = None
    # Format label for the comparison page, if any.
    format: str | None = None
    # Director name(s) as free text, if any.
    director: str | None = None
    # Full IMDB link, if any.
    imdb_url: str | None = None
    # IMDB title identifier (the "tt..." part of the link), if any.
    imdb_id: str | None = None
    # dvdcompare film ID (the "fid" query parameter), if known.
    film_id: int | None = None
    # Regional releases listed on the page, in page order.
    releases: list[Release] = field(default_factory=list)
50
+
51
+
52
@dataclass
class SearchResult:
    """A single result from a dvdcompare.net search."""

    # Title text of the search hit.
    title: str
    # URL of the film comparison page for this hit.
    url: str
    # dvdcompare film ID (the "fid" query parameter), if known.
    film_id: int | None = None
@@ -0,0 +1,351 @@
1
+ from __future__ import annotations
2
+
3
+ import html as html_module
4
+ import re
5
+
6
+ from bs4 import BeautifulSoup
7
+
8
+ from .models import Disc, Feature, FilmComparison, Release, SearchResult
9
+
10
# Spelled-out disc ordinals (as used in "DISC ONE"-style headers) mapped to
# their integer disc numbers. Keys are upper-case; lookups must upper-case
# the word first.
_DISC_WORDS = {
    "ONE": 1,
    "TWO": 2,
    "THREE": 3,
    "FOUR": 4,
    "FIVE": 5,
    "SIX": 6,
    "SEVEN": 7,
    "EIGHT": 8,
    "NINE": 9,
    "TEN": 10,
}
22
+
23
+
24
def parse_runtime(s: str) -> int:
    """Parse ``MM:SS``, ``H:MM:SS``, or ``NNN mins`` into total seconds.

    Surrounding whitespace is ignored. Any input that does not fit one of the
    three shapes, including colon-separated text with non-numeric parts,
    yields ``0`` rather than raising.
    """
    s = s.strip()

    mins_match = re.fullmatch(r"(\d+)\s*mins?", s)
    if mins_match:
        return int(mins_match.group(1)) * 60

    try:
        fields = [int(part) for part in s.split(":")]
    except ValueError:
        # Non-numeric component: treat like any other unrecognised input.
        return 0

    if len(fields) == 2:
        minutes, seconds = fields
        return minutes * 60 + seconds
    if len(fields) == 3:
        hours, minutes, seconds = fields
        return hours * 3600 + minutes * 60 + seconds
    return 0
36
+
37
+
38
def format_runtime(seconds: int) -> str:
    """Render a duration in seconds as ``M:SS``, or ``H:MM:SS`` from one hour up.

    Minutes are zero-padded only when an hours component is shown; seconds are
    always two digits.
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    if not hours:
        return f"{minutes}:{secs:02d}"
    return f"{hours}:{minutes:02d}:{secs:02d}"
45
+
46
+
47
def _disc_number(word: str) -> int:
    """Convert a disc label such as ``"ONE"``, ``"two"`` or ``"3"`` to an int.

    Spelled-out words are looked up case-insensitively in ``_DISC_WORDS``;
    anything else is tried as a numeric literal. Returns ``0`` when neither
    interpretation works.
    """
    key = word.upper()
    spelled = _DISC_WORDS.get(key)
    if spelled is not None:
        return spelled
    try:
        return int(key)
    except ValueError:
        return 0
55
+
56
+
57
def parse_feature_line(line: str) -> Feature:
    """Parse a single feature text line into a :class:`Feature`.

    The line is consumed in a fixed order, each step removing what it
    recognised before the next one runs:

    1. the first bracketed ``[...]`` note becomes ``technical_notes``;
    2. a trailing colon is dropped;
    3. a parenthesised runtime (optionally marked "Play All") becomes
       ``runtime_seconds`` / ``is_play_all``;
    4. a leading quoted string becomes the title, otherwise the whole
       remaining line is the title;
    5. text after a quoted title is split into an optional four-digit year
       and an optional feature type.

    The returned feature never has children; grouping is done by the caller.
    """
    technical_notes = None
    is_play_all = False
    runtime_seconds = None
    year = None
    feature_type = None

    # 1. Extract technical notes [...]
    #    Only the first bracketed span is taken; it is cut out of the line
    #    together with any whitespace immediately before it.
    tech_match = re.search(r"\s*\[([^\]]+)\]", line)
    if tech_match:
        technical_notes = tech_match.group(1)
        line = line[: tech_match.start()] + line[tech_match.end() :]

    # 2. Strip trailing colon (group indicator) -- detected by caller
    line = line.strip()
    if line.endswith(":"):
        line = line[:-1].strip()

    # 3. Extract runtime (MM:SS) / (H:MM:SS) / (Play All - MM:SS) / (NNN mins)
    #    Also handles "with Play All option - MM:SS"
    #    Group 1 is "Play All" when present; group 2 is the runtime text.
    runtime_match = re.search(
        r"\((?:(?:with )?(Play All)(?: option)? - )?(\d{1,3}:\d{2}(?::\d{2})?|\d+\s*mins?)\)",
        line,
    )
    if runtime_match:
        if runtime_match.group(1):
            is_play_all = True
        runtime_seconds = parse_runtime(runtime_match.group(2))
        line = line[: runtime_match.start()] + line[runtime_match.end() :]
        line = line.strip()

    # 3b. Strip trailing colon again (may be exposed after runtime removal)
    if line.endswith(":"):
        line = line[:-1].strip()

    # 4. Extract quoted title vs unquoted
    #    Accepts straight quotes and curly quotes (U+201C / U+201D); the
    #    non-greedy match stops at the first closing quote.
    quote_match = re.match(r'^["\u201c](.+?)["\u201d](.*)$', line)
    if quote_match:
        title = quote_match.group(1).strip()
        rest = quote_match.group(2).strip()
    else:
        title = line.strip()
        rest = ""

    # 5. From rest, extract year and type
    #    A leading four-digit number is the year; whatever follows (or the
    #    whole remainder when there is no year) is the feature type.
    if rest:
        year_match = re.match(r"^(\d{4})\s*(.*)", rest)
        if year_match:
            year = int(year_match.group(1))
            feature_type = year_match.group(2).strip() or None
        else:
            feature_type = rest.strip() or None

    # Normalize whitespace in title
    title = re.sub(r"\s+", " ", title).strip()

    return Feature(
        title=title,
        runtime_seconds=runtime_seconds,
        feature_type=feature_type,
        year=year,
        technical_notes=technical_notes,
        is_play_all=is_play_all,
    )
122
+
123
+
124
def parse_extras(extras_html: str) -> list[Disc]:
    """Parse the inner HTML of an extras description div into :class:`Disc` objects.

    The HTML is flattened to text lines (``<br>`` becomes a newline, other
    tags are dropped, entities are decoded) and then read line by line:

    * ``DISC <word>`` headers, optionally followed by ``(<format>)``, start a
      new disc;
    * lines starting with ``*`` may flag the current disc as holding the film;
    * lines starting with ``"- "`` are sub-features of the current group, or
      plain features of the disc when no group is open;
    * every other line is a feature; it opens a group when it ends with a
      colon or is a "Play All" entry, and closes any open group otherwise.

    Lines that appear before the first disc header are ignored.
    """
    # Replace <br> variants with newlines
    text = re.sub(r"<br\s*/?>", "\n", extras_html)
    # Remove all remaining HTML tags
    text = re.sub(r"<[^>]+>", "", text)
    # Decode HTML entities
    text = html_module.unescape(text)

    lines = [ln.strip() for ln in text.split("\n")]

    discs: list[Disc] = []
    # Disc that features are currently being attached to.
    current_disc: Disc | None = None
    # Group header that "- " sub-features are currently being attached to.
    current_group: Feature | None = None

    for line in lines:
        if not line:
            continue

        # Disc header: DISC ONE (Blu-ray 4K) or DISC ONE
        disc_match = re.match(r"^DISC\s+(\w+)(?:\s+\((.+)\))?$", line)
        if disc_match:
            current_disc = Disc(
                number=_disc_number(disc_match.group(1)),
                format=disc_match.group(2) or "",
            )
            discs.append(current_disc)
            # A new disc never inherits the previous disc's open group.
            current_group = None
            continue

        # "* The Film" marker (possibly with a variant title suffix).
        # dvdcompare uses a leading asterisk to flag the main feature.
        if line.startswith("*"):
            stripped = line.lstrip("*").strip()
            is_film_marker = (
                not stripped  # bare "*"
                or stripped.lower().startswith("the film")
            )
            if is_film_marker and current_disc:
                current_disc.is_film = True
            # Exact "* The Film" (no extra info) -> skip entirely
            if not stripped or stripped.lower() == "the film":
                continue
            # Otherwise strip the asterisk and keep as a feature
            line = stripped
            # fall through to feature parsing below

        # Nothing to attach features to until a disc header has been seen.
        if current_disc is None:
            continue

        # Sub-feature (starts with "- ")
        if line.startswith("- "):
            feature = parse_feature_line(line[2:])
            if current_group:
                current_group.children.append(feature)
            else:
                current_disc.features.append(feature)
            continue

        feature = parse_feature_line(line)
        current_disc.features.append(feature)

        # Detect group header (trailing colon or play-all)
        is_group = line.rstrip().endswith(":") or feature.is_play_all

        if is_group:
            current_group = feature
        else:
            current_group = None

    return discs
195
+
196
+
197
def parse_film_page(html: str) -> FilmComparison:
    """Parse a dvdcompare.net film comparison page into a :class:`FilmComparison`.

    Every piece of metadata is optional: anything whose markup is not found
    is left at its default (empty title, ``None`` fields, empty release
    list) instead of raising.
    """
    soup = BeautifulSoup(html, "html.parser")

    # --- Title, format, year from <h2> ---
    # The first <h2> is read as "Title (Format) (YYYY)", peeling the
    # parenthesised parts off from the right.
    title = ""
    year = None
    disc_format = None
    h2 = soup.find("h2")
    if h2:
        h2_text = h2.get_text(strip=True)
        # Last (YYYY) is the year
        year_match = re.search(r"\((\d{4})\)\s*$", h2_text)
        if year_match:
            year = int(year_match.group(1))
            rest = h2_text[: year_match.start()].strip()
        else:
            rest = h2_text
        # Format in parens at end of remainder
        fmt_match = re.search(r"\(([^)]+)\)\s*$", rest)
        if fmt_match:
            disc_format = fmt_match.group(1)
            title = rest[: fmt_match.start()].strip()
        else:
            title = rest

    # --- IMDB ---
    # First link pointing at an imdb.com title page; the "tt..." path
    # component is the IMDB id.
    imdb_url = None
    imdb_id = None
    imdb_link = soup.find("a", href=re.compile(r"imdb\.com/title/"))
    if imdb_link:
        imdb_url = imdb_link["href"]
        id_match = re.search(r"/(tt\d+)", imdb_url)
        if id_match:
            imdb_id = id_match.group(1)

    # --- Director ---
    # Taken from the text of <div id="content">: everything after
    # "Director:" up to the end of that line.
    director = None
    content_div = soup.find("div", id="content")
    if content_div:
        dir_match = re.search(
            r"Director:\s*(.+?)(?:\n|$)", content_div.get_text()
        )
        if dir_match:
            director = dir_match.group(1).strip()

    # --- Film ID ---
    # Read from the "fid" query parameter of a form that posts to film.php.
    film_id = None
    form = soup.find("form", action=re.compile(r"film\.php\?fid="))
    if form:
        fid_match = re.search(r"fid=(\d+)", form["action"])
        if fid_match:
            film_id = int(fid_match.group(1))

    # --- Releases ---
    # Each table row that contains a <ul class="dvd"> with an <h3> heading
    # is one release; rows without either are skipped.
    releases: list[Release] = []
    table = soup.find("table", attrs={"border": "0", "align": "center"})
    if table:
        for tr in table.find_all("tr"):
            ul = tr.find("ul", class_="dvd")
            if not ul:
                continue

            h3 = ul.find("h3")
            if not h3:
                continue

            # Release name and year
            release_year = None
            year_span = h3.find("span", class_="disc-release-year")
            if year_span:
                ry_match = re.search(r"\[(\d{4})", year_span.get_text())
                if ry_match:
                    release_year = int(ry_match.group(1))
                # The year span sits inside the heading, so remove its text
                # from the heading text to leave just the release name.
                release_name = h3.get_text(strip=True).replace(
                    year_span.get_text(strip=True), ""
                ).strip()
            else:
                release_name = h3.get_text(strip=True)

            # Find extras
            # Only direct <li> children are scanned; the first one labelled
            # "Extras:" ends the scan whether or not it has a description.
            discs: list[Disc] = []
            for li in ul.find_all("li", recursive=False):
                label_div = li.find("div", class_="label")
                if label_div and "Extras:" in label_div.get_text():
                    desc_div = li.find("div", class_="description")
                    if desc_div:
                        discs = parse_extras(desc_div.decode_contents())
                    break

            releases.append(
                Release(name=release_name, year=release_year, discs=discs)
            )

    return FilmComparison(
        title=title,
        year=year,
        format=disc_format,
        director=director,
        imdb_url=imdb_url,
        imdb_id=imdb_id,
        film_id=film_id,
        releases=releases,
    )
301
+
302
+
303
def parse_search_results(html: str) -> list[SearchResult]:
    """Parse a dvdcompare.net search results page.

    When the search returns exactly one hit, dvdcompare emits a JavaScript
    redirect (``location.href="film.php?fid=..."``) instead of a clickable
    ``<a>`` tag. This function handles both cases.

    Links are returned in page order with duplicate film IDs dropped and
    relative hrefs expanded to absolute URLs. The redirect fallback is only
    consulted when no usable links were found, and yields at most one result.
    """
    soup = BeautifulSoup(html, "html.parser")
    results: list[SearchResult] = []
    # Film IDs already emitted, so repeated links to one film appear once.
    seen: set[int] = set()

    for link in soup.find_all("a", href=re.compile(r"film\.php\?fid=\d+")):
        text = link.get_text(strip=True)
        # Links with no visible text cannot supply a title; skip them.
        if not text:
            continue
        href = link["href"]
        fid_match = re.search(r"fid=(\d+)", href)
        film_id = int(fid_match.group(1)) if fid_match else None

        if film_id and film_id in seen:
            continue
        if film_id:
            seen.add(film_id)

        # Relative hrefs are resolved against the comparisons directory.
        if not href.startswith("http"):
            href = f"https://www.dvdcompare.net/comparisons/{href}"

        results.append(SearchResult(title=text, url=href, film_id=film_id))

    # Single-result pages use a JS redirect instead of <a> links.
    if not results:
        for script in soup.find_all("script"):
            content = script.string or ""
            m = re.search(
                r'location\.href\s*=\s*"(film\.php\?fid=(\d+))"', content
            )
            if m:
                fid = int(m.group(2))
                href = f"https://www.dvdcompare.net/comparisons/{m.group(1)}"
                # Try to grab the title from the <h2> nearby.
                # Prefer the italic part of the heading when there is one.
                h2 = soup.find("h2")
                title = ""
                if h2:
                    italic = h2.find("i")
                    title = italic.get_text(strip=True) if italic else h2.get_text(strip=True)
                results.append(SearchResult(title=title, url=href, film_id=fid))
                break

    return results