rolling-reader 0.6.0__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rolling-reader
3
- Version: 0.6.0
3
+ Version: 0.6.2
4
4
  Summary: Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction
5
5
  License: MIT
6
6
  Requires-Python: >=3.11
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "rolling-reader"
7
- version = "0.6.0"
7
+ version = "0.6.2"
8
8
  description = "Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction"
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -17,6 +17,7 @@ import asyncio
17
17
  import json
18
18
  import sys
19
19
  from enum import Enum
20
+ from pathlib import Path
20
21
  from typing import Optional
21
22
 
22
23
  import typer
@@ -267,31 +268,42 @@ async def _run_batch(
267
268
  @app.command(name="chrome")
268
269
  def launch_chrome(
269
270
  port: int = typer.Option(9222, "--port", "-p", help="Remote debugging port (default: 9222)"),
271
+ fresh: bool = typer.Option(False, "--fresh", help="Use a clean profile instead of your real Chrome profile"),
270
272
  ) -> None:
271
- """Launch Chrome with remote debugging enabled.
273
+ """Launch Chrome with remote debugging, using your real login session.
272
274
 
273
- Finds Chrome automatically and starts it in the background.
274
- After running this, Level 2/3 scraping works immediately.
275
+ Chrome must be fully closed before running this command.
276
+ Your existing logins, cookies, and history are preserved.
275
277
 
276
278
  Example:
277
279
 
278
- rr chrome
279
- rr https://app.example.com/dashboard
280
+ rr chrome # close Chrome first, then run this
281
+ rr https://twitter.com/home # scrapes with your login
280
282
  """
281
283
  import subprocess
282
284
  import platform
283
-
284
285
  import asyncio
285
286
  import time
286
287
  import os
287
- import tempfile
288
288
 
289
- # 先检查端口是否已经在用(Chrome 已经以调试模式运行)
289
+ # 先检查端口是否已经在用(已有调试模式 Chrome
290
290
  if asyncio.run(_check_cdp(port)):
291
291
  typer.echo(f"Chrome is already running with remote debugging on port {port}.")
292
292
  typer.echo("Ready — run: rr <url>")
293
293
  return
294
294
 
295
+ # 检查是否有 Chrome 后台进程,有的话先杀掉
296
+ # (Chrome 关窗口后仍可能有后台进程,会阻止新实例开调试端口)
297
+ if _chrome_is_running():
298
+ typer.echo("Closing Chrome background processes...", err=True)
299
+ _kill_chrome()
300
+ # 等进程完全退出(最多 10 秒,比固定 3 秒更可靠)
301
+ deadline_kill = time.time() + 10
302
+ while time.time() < deadline_kill:
303
+ time.sleep(0.5)
304
+ if not _chrome_is_running():
305
+ break
306
+
295
307
  exe = _find_chrome()
296
308
  if exe is None:
297
309
  typer.echo(
@@ -301,36 +313,50 @@ def launch_chrome(
301
313
  )
302
314
  raise typer.Exit(code=1)
303
315
 
304
- # 用独立的 user-data-dir,避免被已有 Chrome 进程吞掉
305
- debug_profile = os.path.join(tempfile.gettempdir(), "rolling-reader-chrome")
306
- os.makedirs(debug_profile, exist_ok=True)
316
+ # rolling-reader 专用的持久化 profile 目录
317
+ # 每次 rr chrome 都用同一个目录 → 登录状态永久保留
318
+ profile_dir = os.path.join(Path.home(), ".rolling-reader", "chrome-profile")
319
+ os.makedirs(profile_dir, exist_ok=True)
320
+
321
+ if fresh:
322
+ import shutil
323
+ shutil.rmtree(profile_dir, ignore_errors=True)
324
+ os.makedirs(profile_dir, exist_ok=True)
325
+ typer.echo("Starting with a fresh profile (login state cleared).", err=True)
326
+ else:
327
+ # 从真实 Chrome profile 同步 cookies(Chrome 已关闭,安全复制)
328
+ synced = _sync_cookies_from_real_profile(str(profile_dir))
329
+ if synced:
330
+ typer.echo("Synced cookies from your Chrome profile.", err=True)
331
+ else:
332
+ first_time = not any(os.scandir(profile_dir))
333
+ if first_time:
334
+ typer.echo(
335
+ "First run: a new browser profile will open.\n"
336
+ "Log in to any sites you need — your login state will be saved for next time.",
337
+ err=True,
338
+ )
339
+ else:
340
+ typer.echo("Using saved profile (your logins are preserved).", err=True)
307
341
 
308
342
  args = [
309
343
  exe,
310
344
  f"--remote-debugging-port={port}",
311
- f"--user-data-dir={debug_profile}",
345
+ f"--user-data-dir={profile_dir}",
312
346
  "--remote-allow-origins=*",
313
347
  "--no-first-run",
314
348
  "--no-default-browser-check",
315
349
  ]
316
350
 
317
- system = platform.system()
318
351
  try:
319
- if system == "Windows":
320
- subprocess.Popen(
321
- args,
322
- creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP,
323
- close_fds=True,
324
- )
325
- else:
326
- subprocess.Popen(args, start_new_session=True, close_fds=True)
352
+ subprocess.Popen(args, close_fds=True)
327
353
  except Exception as e:
328
354
  typer.echo(f"Error: failed to launch Chrome: {e}", err=True)
329
355
  raise typer.Exit(code=1)
330
356
 
331
- # 等待 Chrome 初始化调试端口(最多 8 秒)
357
+ # 等待 Chrome 初始化(最多 15 秒)
332
358
  typer.echo("Starting Chrome...", err=True)
333
- deadline = time.time() + 8
359
+ deadline = time.time() + 15
334
360
  while time.time() < deadline:
335
361
  time.sleep(0.5)
336
362
  if asyncio.run(_check_cdp(port)):
@@ -339,8 +365,8 @@ def launch_chrome(
339
365
  return
340
366
 
341
367
  typer.echo(
342
- f"Warning: Chrome launched but port {port} is not responding yet.\n"
343
- "Wait a moment and try your rr command — it may still be starting up.",
368
+ "Chrome is starting if it doesn't respond, make sure all Chrome windows were closed first.\n"
369
+ "Then retry: rr chrome",
344
370
  err=True,
345
371
  )
346
372
 
@@ -356,6 +382,89 @@ async def _check_cdp(port: int) -> bool:
356
382
  return False
357
383
 
358
384
 
385
+ def _chrome_is_running() -> bool:
386
+ """检查是否有 Chrome 进程在运行。"""
387
+ import subprocess, platform
388
+ if platform.system() == "Windows":
389
+ r = subprocess.run(
390
+ ["tasklist", "/FI", "IMAGENAME eq chrome.exe", "/FO", "CSV"],
391
+ capture_output=True, text=True,
392
+ )
393
+ return "chrome.exe" in r.stdout
394
+ else:
395
+ r = subprocess.run(["pgrep", "-x", "chrome", "chromium"], capture_output=True)
396
+ return r.returncode == 0
397
+
398
+
399
def _kill_chrome() -> None:
    """Force-terminate all Chrome/Chromium processes.

    Windows uses ``taskkill /F /IM chrome.exe``; every other platform issues
    ``pkill -x`` once for ``chrome`` and once for ``chromium``. Command output
    is captured and discarded and exit codes are not inspected, so "nothing
    to kill" is accepted silently.
    """
    import subprocess, platform

    on_windows = platform.system() == "Windows"
    if on_windows:
        commands = [["taskkill", "/F", "/IM", "chrome.exe"]]
    else:
        commands = [["pkill", "-x", name] for name in ("chrome", "chromium")]

    for command in commands:
        subprocess.run(command, capture_output=True)
408
+
409
+
410
+ def _find_real_chrome_profile() -> Optional[str]:
411
+ """找到真实 Chrome profile 目录(含登录态、cookies)。"""
412
+ import platform, os
413
+ system = platform.system()
414
+ if system == "Windows":
415
+ base = os.environ.get("LOCALAPPDATA", "")
416
+ path = os.path.join(base, "Google", "Chrome", "User Data")
417
+ elif system == "Darwin":
418
+ path = os.path.expanduser("~/Library/Application Support/Google/Chrome")
419
+ else:
420
+ path = os.path.expanduser("~/.config/google-chrome")
421
+ return path if os.path.isdir(path) else None
422
+
423
+
424
def _sync_cookies_from_real_profile(rr_profile: str) -> bool:
    """Copy cookie data from the real Chrome profile into the rolling-reader profile.

    Files copied (same relative path in source and destination):
      * ``Default/Network/Cookies`` - cookie database (SQLite) at the location
        used by newer Chrome versions
      * ``Default/Cookies``         - legacy location, kept for compatibility
      * ``Local State``             - carries the key material needed to
        decrypt the cookies (DPAPI-protected on Windows)

    Must be called only after Chrome has fully exited, to avoid SQLite lock
    conflicts on the cookie database.

    Args:
        rr_profile: Path of the rolling-reader Chrome user-data directory.

    Returns:
        True if at least one cookie database was copied. False if the real
        profile does not exist, contains no cookie database, or the files
        could not be copied. Copying only ``Local State`` is not a successful
        cookie sync.
    """
    import shutil, os

    real = _find_real_chrome_profile()
    if not real:
        return False

    cookie_dbs = (
        os.path.join("Default", "Network", "Cookies"),
        os.path.join("Default", "Cookies"),
    )
    relative_paths = (*cookie_dbs, "Local State")

    cookies_copied = False
    for rel in relative_paths:
        src = os.path.join(real, rel)
        if not os.path.exists(src):
            continue
        dst = os.path.join(rr_profile, rel)
        try:
            # Directory creation is part of the best-effort copy: a failure
            # here must skip this file, not abort the whole command.
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copy2(src, dst)
        except OSError:
            # File still locked (Chrome not fully exited) or destination not
            # writable: skip it and keep going.
            continue
        if rel in cookie_dbs:
            cookies_copied = True

    return cookies_copied
466
+
467
+
359
468
  def _find_chrome() -> Optional[str]:
360
469
  """在各平台上自动定位 Chrome 可执行文件。"""
361
470
  import platform
File without changes