rolling-reader 0.6.0__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/PKG-INFO +1 -1
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/pyproject.toml +1 -1
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/cli.py +134 -25
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/.gitignore +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/README.md +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/__init__.py +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/cache/__init__.py +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/cache/profile.py +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/dispatcher.py +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/extractor/__init__.py +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/extractor/cdp.py +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/extractor/clean.py +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/extractor/http.py +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/extractor/state.py +0 -0
- {rolling_reader-0.6.0 → rolling_reader-0.6.2}/src/rolling_reader/models.py +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "rolling-reader"
|
|
7
|
-
version = "0.6.0"
|
|
7
|
+
version = "0.6.2"
|
|
8
8
|
description = "Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -17,6 +17,7 @@ import asyncio
|
|
|
17
17
|
import json
|
|
18
18
|
import sys
|
|
19
19
|
from enum import Enum
|
|
20
|
+
from pathlib import Path
|
|
20
21
|
from typing import Optional
|
|
21
22
|
|
|
22
23
|
import typer
|
|
@@ -267,31 +268,42 @@ async def _run_batch(
|
|
|
267
268
|
@app.command(name="chrome")
|
|
268
269
|
def launch_chrome(
|
|
269
270
|
port: int = typer.Option(9222, "--port", "-p", help="Remote debugging port (default: 9222)"),
|
|
271
|
+
fresh: bool = typer.Option(False, "--fresh", help="Use a clean profile instead of your real Chrome profile"),
|
|
270
272
|
) -> None:
|
|
271
|
-
"""Launch Chrome with remote debugging
|
|
273
|
+
"""Launch Chrome with remote debugging, using your real login session.
|
|
272
274
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
+
Chrome must be fully closed before running this command.
|
|
276
|
+
Your existing logins, cookies, and history are preserved.
|
|
275
277
|
|
|
276
278
|
Example:
|
|
277
279
|
|
|
278
|
-
rr chrome
|
|
279
|
-
rr https://
|
|
280
|
+
rr chrome # close Chrome first, then run this
|
|
281
|
+
rr https://twitter.com/home # scrapes with your login
|
|
280
282
|
"""
|
|
281
283
|
import subprocess
|
|
282
284
|
import platform
|
|
283
|
-
|
|
284
285
|
import asyncio
|
|
285
286
|
import time
|
|
286
287
|
import os
|
|
287
|
-
import tempfile
|
|
288
288
|
|
|
289
|
-
#
|
|
289
|
+
# 先检查端口是否已经在用(已有调试模式 Chrome)
|
|
290
290
|
if asyncio.run(_check_cdp(port)):
|
|
291
291
|
typer.echo(f"Chrome is already running with remote debugging on port {port}.")
|
|
292
292
|
typer.echo("Ready — run: rr <url>")
|
|
293
293
|
return
|
|
294
294
|
|
|
295
|
+
# 检查是否有 Chrome 后台进程,有的话先杀掉
|
|
296
|
+
# (Chrome 关窗口后仍可能有后台进程,会阻止新实例开调试端口)
|
|
297
|
+
if _chrome_is_running():
|
|
298
|
+
typer.echo("Closing Chrome background processes...", err=True)
|
|
299
|
+
_kill_chrome()
|
|
300
|
+
# 等进程完全退出(最多 10 秒,比固定 3 秒更可靠)
|
|
301
|
+
deadline_kill = time.time() + 10
|
|
302
|
+
while time.time() < deadline_kill:
|
|
303
|
+
time.sleep(0.5)
|
|
304
|
+
if not _chrome_is_running():
|
|
305
|
+
break
|
|
306
|
+
|
|
295
307
|
exe = _find_chrome()
|
|
296
308
|
if exe is None:
|
|
297
309
|
typer.echo(
|
|
@@ -301,36 +313,50 @@ def launch_chrome(
|
|
|
301
313
|
)
|
|
302
314
|
raise typer.Exit(code=1)
|
|
303
315
|
|
|
304
|
-
#
|
|
305
|
-
|
|
306
|
-
os.
|
|
316
|
+
# rolling-reader 专用的持久化 profile 目录
|
|
317
|
+
# 每次 rr chrome 都用同一个目录 → 登录状态永久保留
|
|
318
|
+
profile_dir = os.path.join(Path.home(), ".rolling-reader", "chrome-profile")
|
|
319
|
+
os.makedirs(profile_dir, exist_ok=True)
|
|
320
|
+
|
|
321
|
+
if fresh:
|
|
322
|
+
import shutil
|
|
323
|
+
shutil.rmtree(profile_dir, ignore_errors=True)
|
|
324
|
+
os.makedirs(profile_dir, exist_ok=True)
|
|
325
|
+
typer.echo("Starting with a fresh profile (login state cleared).", err=True)
|
|
326
|
+
else:
|
|
327
|
+
# 从真实 Chrome profile 同步 cookies(Chrome 已关闭,安全复制)
|
|
328
|
+
synced = _sync_cookies_from_real_profile(str(profile_dir))
|
|
329
|
+
if synced:
|
|
330
|
+
typer.echo("Synced cookies from your Chrome profile.", err=True)
|
|
331
|
+
else:
|
|
332
|
+
first_time = not any(os.scandir(profile_dir))
|
|
333
|
+
if first_time:
|
|
334
|
+
typer.echo(
|
|
335
|
+
"First run: a new browser profile will open.\n"
|
|
336
|
+
"Log in to any sites you need — your login state will be saved for next time.",
|
|
337
|
+
err=True,
|
|
338
|
+
)
|
|
339
|
+
else:
|
|
340
|
+
typer.echo("Using saved profile (your logins are preserved).", err=True)
|
|
307
341
|
|
|
308
342
|
args = [
|
|
309
343
|
exe,
|
|
310
344
|
f"--remote-debugging-port={port}",
|
|
311
|
-
f"--user-data-dir={
|
|
345
|
+
f"--user-data-dir={profile_dir}",
|
|
312
346
|
"--remote-allow-origins=*",
|
|
313
347
|
"--no-first-run",
|
|
314
348
|
"--no-default-browser-check",
|
|
315
349
|
]
|
|
316
350
|
|
|
317
|
-
system = platform.system()
|
|
318
351
|
try:
|
|
319
|
-
|
|
320
|
-
subprocess.Popen(
|
|
321
|
-
args,
|
|
322
|
-
creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP,
|
|
323
|
-
close_fds=True,
|
|
324
|
-
)
|
|
325
|
-
else:
|
|
326
|
-
subprocess.Popen(args, start_new_session=True, close_fds=True)
|
|
352
|
+
subprocess.Popen(args, close_fds=True)
|
|
327
353
|
except Exception as e:
|
|
328
354
|
typer.echo(f"Error: failed to launch Chrome: {e}", err=True)
|
|
329
355
|
raise typer.Exit(code=1)
|
|
330
356
|
|
|
331
|
-
# 等待 Chrome
|
|
357
|
+
# 等待 Chrome 初始化(最多 15 秒)
|
|
332
358
|
typer.echo("Starting Chrome...", err=True)
|
|
333
|
-
deadline = time.time() +
|
|
359
|
+
deadline = time.time() + 15
|
|
334
360
|
while time.time() < deadline:
|
|
335
361
|
time.sleep(0.5)
|
|
336
362
|
if asyncio.run(_check_cdp(port)):
|
|
@@ -339,8 +365,8 @@ def launch_chrome(
|
|
|
339
365
|
return
|
|
340
366
|
|
|
341
367
|
typer.echo(
|
|
342
|
-
|
|
343
|
-
"
|
|
368
|
+
"Chrome is starting — if it doesn't respond, make sure all Chrome windows were closed first.\n"
|
|
369
|
+
"Then retry: rr chrome",
|
|
344
370
|
err=True,
|
|
345
371
|
)
|
|
346
372
|
|
|
@@ -356,6 +382,89 @@ async def _check_cdp(port: int) -> bool:
|
|
|
356
382
|
return False
|
|
357
383
|
|
|
358
384
|
|
|
385
|
+
def _chrome_is_running() -> bool:
|
|
386
|
+
"""检查是否有 Chrome 进程在运行。"""
|
|
387
|
+
import subprocess, platform
|
|
388
|
+
if platform.system() == "Windows":
|
|
389
|
+
r = subprocess.run(
|
|
390
|
+
["tasklist", "/FI", "IMAGENAME eq chrome.exe", "/FO", "CSV"],
|
|
391
|
+
capture_output=True, text=True,
|
|
392
|
+
)
|
|
393
|
+
return "chrome.exe" in r.stdout
|
|
394
|
+
else:
|
|
395
|
+
r = subprocess.run(["pgrep", "-x", "chrome", "chromium"], capture_output=True)
|
|
396
|
+
return r.returncode == 0
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def _kill_chrome() -> None:
    """Force-terminate every running Chrome/Chromium process (best effort).

    Uses ``taskkill /F /IM chrome.exe`` on Windows and ``pkill -x`` for the
    ``chrome`` and ``chromium`` process names elsewhere. Exit codes are
    ignored, and a missing kill tool is tolerated, because callers are
    expected to re-check afterwards whether Chrome is still running.
    """
    import platform
    import subprocess

    if platform.system() == "Windows":
        commands = [["taskkill", "/F", "/IM", "chrome.exe"]]
    else:
        commands = [["pkill", "-x", "chrome"], ["pkill", "-x", "chromium"]]

    for command in commands:
        try:
            subprocess.run(command, capture_output=True)
        except OSError:
            # Tool not installed / not executable: nothing more we can do for
            # this name, but still try the remaining commands.
            continue
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _find_real_chrome_profile() -> Optional[str]:
|
|
411
|
+
"""找到真实 Chrome profile 目录(含登录态、cookies)。"""
|
|
412
|
+
import platform, os
|
|
413
|
+
system = platform.system()
|
|
414
|
+
if system == "Windows":
|
|
415
|
+
base = os.environ.get("LOCALAPPDATA", "")
|
|
416
|
+
path = os.path.join(base, "Google", "Chrome", "User Data")
|
|
417
|
+
elif system == "Darwin":
|
|
418
|
+
path = os.path.expanduser("~/Library/Application Support/Google/Chrome")
|
|
419
|
+
else:
|
|
420
|
+
path = os.path.expanduser("~/.config/google-chrome")
|
|
421
|
+
return path if os.path.isdir(path) else None
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def _sync_cookies_from_real_profile(rr_profile: str) -> bool:
|
|
425
|
+
"""
|
|
426
|
+
从真实 Chrome profile 复制 cookies 到 rolling-reader profile。
|
|
427
|
+
|
|
428
|
+
复制内容:
|
|
429
|
+
- Default/Cookies — cookie 数据(SQLite)
|
|
430
|
+
- Local State — DPAPI 加密密钥(解密 cookies 必须)
|
|
431
|
+
|
|
432
|
+
必须在 Chrome 完全关闭后调用(避免 SQLite 锁冲突)。
|
|
433
|
+
|
|
434
|
+
Returns:
|
|
435
|
+
True — 复制成功
|
|
436
|
+
False — 真实 profile 不存在或文件不可读
|
|
437
|
+
"""
|
|
438
|
+
import shutil, os
|
|
439
|
+
|
|
440
|
+
real = _find_real_chrome_profile()
|
|
441
|
+
if not real:
|
|
442
|
+
return False
|
|
443
|
+
|
|
444
|
+
# 需要同步的文件:(源相对路径, 目标相对路径)
|
|
445
|
+
# Chrome 新版本把 Cookies 移到了 Default/Network/Cookies
|
|
446
|
+
files_to_copy = [
|
|
447
|
+
(os.path.join("Default", "Network", "Cookies"), os.path.join("Default", "Network", "Cookies")),
|
|
448
|
+
(os.path.join("Default", "Cookies"), os.path.join("Default", "Cookies")), # 旧版路径保留兼容
|
|
449
|
+
("Local State", "Local State"),
|
|
450
|
+
]
|
|
451
|
+
|
|
452
|
+
copied = 0
|
|
453
|
+
for src_rel, dst_rel in files_to_copy:
|
|
454
|
+
src = os.path.join(real, src_rel)
|
|
455
|
+
dst = os.path.join(rr_profile, dst_rel)
|
|
456
|
+
if not os.path.exists(src):
|
|
457
|
+
continue
|
|
458
|
+
os.makedirs(os.path.dirname(dst), exist_ok=True)
|
|
459
|
+
try:
|
|
460
|
+
shutil.copy2(src, dst)
|
|
461
|
+
copied += 1
|
|
462
|
+
except OSError:
|
|
463
|
+
pass # 文件仍被锁(Chrome 未完全退出),跳过
|
|
464
|
+
|
|
465
|
+
return copied > 0
|
|
466
|
+
|
|
467
|
+
|
|
359
468
|
def _find_chrome() -> Optional[str]:
|
|
360
469
|
"""在各平台上自动定位 Chrome 可执行文件。"""
|
|
361
470
|
import platform
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|