PyPI - harvex - Versions diffs - 0.1.0__tar.gz - Mend

harvex 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82) hide show

harvex-0.1.0/.gitignore +25 -0
harvex-0.1.0/CHANGELOG.md +39 -0
harvex-0.1.0/LICENSE +21 -0
harvex-0.1.0/PKG-INFO +132 -0
harvex-0.1.0/README.md +95 -0
harvex-0.1.0/pyproject.toml +59 -0
harvex-0.1.0/src/harvex/__init__.py +50 -0
harvex-0.1.0/src/harvex/cli/__init__.py +1 -0
harvex-0.1.0/src/harvex/cli/main.py +238 -0
harvex-0.1.0/src/harvex/config/__init__.py +51 -0
harvex-0.1.0/src/harvex/config/defaults.py +54 -0
harvex-0.1.0/src/harvex/config/settings.py +355 -0
harvex-0.1.0/src/harvex/core/__init__.py +1 -0
harvex-0.1.0/src/harvex/core/context.py +41 -0
harvex-0.1.0/src/harvex/core/errors.py +49 -0
harvex-0.1.0/src/harvex/core/pipeline.py +136 -0
harvex-0.1.0/src/harvex/core/record.py +86 -0
harvex-0.1.0/src/harvex/core/registry.py +97 -0
harvex-0.1.0/src/harvex/core/runner.py +95 -0
harvex-0.1.0/src/harvex/core/source.py +79 -0
harvex-0.1.0/src/harvex/extras/__init__.py +1 -0
harvex-0.1.0/src/harvex/extras/browser/__init__.py +14 -0
harvex-0.1.0/src/harvex/extras/browser/browser_source.py +134 -0
harvex-0.1.0/src/harvex/extras/browser/runtime.py +129 -0
harvex-0.1.0/src/harvex/extras/llm/__init__.py +19 -0
harvex-0.1.0/src/harvex/extras/llm/cache.py +132 -0
harvex-0.1.0/src/harvex/extras/llm/enrich.py +89 -0
harvex-0.1.0/src/harvex/extras/llm/schema.py +112 -0
harvex-0.1.0/src/harvex/extras/llm/translator.py +215 -0
harvex-0.1.0/src/harvex/extras/tui/__init__.py +1 -0
harvex-0.1.0/src/harvex/extras/tui/panel.py +300 -0
harvex-0.1.0/src/harvex/extras/tui/textwidth.py +111 -0
harvex-0.1.0/src/harvex/extras/web/__init__.py +1 -0
harvex-0.1.0/src/harvex/extras/web/assets/app.js +155 -0
harvex-0.1.0/src/harvex/extras/web/assets/index.html +64 -0
harvex-0.1.0/src/harvex/extras/web/assets/styles.css +116 -0
harvex-0.1.0/src/harvex/extras/web/queries.py +138 -0
harvex-0.1.0/src/harvex/extras/web/service.py +229 -0
harvex-0.1.0/src/harvex/meta/__init__.py +18 -0
harvex-0.1.0/src/harvex/meta/health.py +138 -0
harvex-0.1.0/src/harvex/meta/metadata_db.py +297 -0
harvex-0.1.0/src/harvex/net/__init__.py +1 -0
harvex-0.1.0/src/harvex/net/http_client.py +197 -0
harvex-0.1.0/src/harvex/net/retry.py +101 -0
harvex-0.1.0/src/harvex/notify/__init__.py +12 -0
harvex-0.1.0/src/harvex/notify/notifier.py +65 -0
harvex-0.1.0/src/harvex/notify/webhook.py +200 -0
harvex-0.1.0/src/harvex/obs/__init__.py +10 -0
harvex-0.1.0/src/harvex/obs/logging.py +135 -0
harvex-0.1.0/src/harvex/py.typed +0 -0
harvex-0.1.0/src/harvex/scheduling/__init__.py +1 -0
harvex-0.1.0/src/harvex/scheduling/cron.py +37 -0
harvex-0.1.0/src/harvex/scheduling/launchd.py +87 -0
harvex-0.1.0/src/harvex/storage/__init__.py +17 -0
harvex-0.1.0/src/harvex/storage/database.py +326 -0
harvex-0.1.0/src/harvex/storage/schema.py +89 -0
harvex-0.1.0/src/harvex/storage/sink.py +38 -0
harvex-0.1.0/src/harvex/storage/sqlite_sink.py +166 -0
harvex-0.1.0/templates/project/.env.local.example +4 -0
harvex-0.1.0/templates/project/config.toml +21 -0
harvex-0.1.0/templates/project/fields.py +23 -0
harvex-0.1.0/templates/project/pyproject.toml +18 -0
harvex-0.1.0/templates/project/sources/__init__.py +1 -0
harvex-0.1.0/templates/project/sources/example_source.py +39 -0
harvex-0.1.0/tests/config/__init__.py +1 -0
harvex-0.1.0/tests/config/test_settings.py +57 -0
harvex-0.1.0/tests/conftest.py +65 -0
harvex-0.1.0/tests/core/__init__.py +1 -0
harvex-0.1.0/tests/core/test_pipeline_runner.py +90 -0
harvex-0.1.0/tests/core/test_record.py +184 -0
harvex-0.1.0/tests/core/test_registry.py +63 -0
harvex-0.1.0/tests/extras/__init__.py +0 -0
harvex-0.1.0/tests/extras/test_extras.py +64 -0
harvex-0.1.0/tests/meta/__init__.py +1 -0
harvex-0.1.0/tests/meta/test_metadata_health.py +55 -0
harvex-0.1.0/tests/net/__init__.py +1 -0
harvex-0.1.0/tests/net/test_http_client.py +70 -0
harvex-0.1.0/tests/notify/__init__.py +1 -0
harvex-0.1.0/tests/notify/test_webhook.py +44 -0
harvex-0.1.0/tests/storage/__init__.py +1 -0
harvex-0.1.0/tests/storage/test_sqlite_sink.py +46 -0
harvex-0.1.0/uv.lock +751 -0

harvex-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,25 @@
+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+.venv/
+.pytest_cache/
+.coverage
+htmlcov/
+build/
+dist/
+# 本地数据与日志（框架自身仓库不存放运行产物）
+logs/*.log
+*.db
+*.db-wal
+*.db-shm
+database_backup/
+# 密钥
+.env.local
+.env
+# 系统
+.DS_Store
+.playwright-mcp/

harvex-0.1.0/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,39 @@
+# Changelog
+本项目遵循 [语义化版本](https://semver.org/lang/zh-CN/)。
+## [0.1.0] - 2026-06-24
+首个公开发布。AI 时代的数据采集基座，为 AI agent 与 vibecoding 设计。
+### 核心
+- `HarvestRecord`（pydantic v2）字段收口契约：未声明字段自动折叠进 extra 列，脏数据写库前拦截。
+- `BaseSource` + `SourceProfile` 类契约，`SourceRegistry` 扫描 `sources/` 自动发现。
+- `Pipeline` 两阶段编排（并行采集 / 主线程串行落地，规避 SQLite 跨线程问题）。
+- `run_sources` 并发统筹：故障隔离、轮级备份、汇总、失败/异常告警。
+### 存储 / 元数据 / 韧性
+- SQLite Sink：建表/自动补列 + upsert 去重 + 轮级备份；`Sink` 抽象预留多出口扩展。
+- 元数据流水库 + 数据健康检查（归零 / 骤降识别）。
+- `httpx` 统一客户端 + `tenacity` 指数退避重试；stdlib 结构化日志；webhook 告警（Bark/飞书/Server酱）。
+### CLI / 调度
+- `harvex` CLI：`list / run / health / gen-launchd / gen-cron / web / tui`。
+- 生成 macOS launchd plist 与 crontab，调度与 Web 解耦。
+### 可选扩展（extras）
+- `[web]`：stdlib http.server 只读浏览 UI（零第三方依赖）。
+- `[llm]`：OpenAI 翻译 + 缓存 + 重试、一句话增强。
+- `[tui]`：本地终端控制面板。
+- `[browser]`：playwright 渲染型源基类。
+### 工程
+- 42 个测试（零真实网络 / OpenAI / 浏览器）。
+- `py.typed` 类型标记，核心零重依赖。
+[0.1.0]: https://github.com/fangxuanxin/harvex/releases/tag/v0.1.0

harvex-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Stephen Fang
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

harvex-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,132 @@
+Metadata-Version: 2.4
+Name: harvex
+Version: 0.1.0
+Summary: AI 时代的数据采集基座 —— 为 AI agent 与 vibecoding 打造的零样板数据采集框架
+Project-URL: Homepage, https://github.com/fangxuanxin/harvex
+Project-URL: Repository, https://github.com/fangxuanxin/harvex
+Project-URL: Issues, https://github.com/fangxuanxin/harvex/issues
+Project-URL: Changelog, https://github.com/fangxuanxin/harvex/blob/main/CHANGELOG.md
+Author-email: Stephen Fang <qazwsx80@gmail.com>
+License: MIT
+License-File: LICENSE
+Keywords: ai-agent,crawler,data-harvesting,etl,framework,llm,pipeline,scraping,sqlite,vibecoding
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Database
+Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+Requires-Python: >=3.11
+Requires-Dist: httpx>=0.27
+Requires-Dist: pydantic-settings>=2.2
+Requires-Dist: pydantic>=2.6
+Requires-Dist: tenacity>=8.2
+Provides-Extra: browser
+Requires-Dist: playwright>=1.42; extra == 'browser'
+Provides-Extra: dev
+Requires-Dist: pytest-cov>=5.0; extra == 'dev'
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Provides-Extra: llm
+Requires-Dist: openai>=1.30; extra == 'llm'
+Provides-Extra: tui
+Provides-Extra: web
+Description-Content-Type: text/markdown
+# harvex
+**AI 时代的数据采集基座** —— 为 AI agent 与 vibecoding 打造的零样板数据采集框架。
+让 AI（或你自己 vibecoding）只需写「**怎么抓、怎么解析**」这一件事，其余全部交给框架：
+并发调度、字段收口、写库去重、元数据流水、HTTP 重试、日志、告警、数据健康检查、
+定时调度、Web 浏览、LLM 翻译增强、TUI 控制面板。
+```bash
+pip install harvex          # 核心零重依赖
+pip install "harvex[web,llm,browser,tui]"   # 按需启用扩展
+```
+## 为什么是「AI 时代的采集基座」
+让 LLM 写爬虫时，模型最擅长的是「这个页面/接口怎么解析成结构化数据」，最不擅长、
+也最容易写错的是周边工程：重试退避、并发隔离、增量去重、schema 漂移、调度、可观测。
+harvex 把后者全部沉淀成稳定基座，给 AI 留下一个**极窄、极稳的契约面**：
+```python
+from harvex import BaseSource, SourceProfile
+class GithubTrending(BaseSource):
+    profile = SourceProfile(slug="gh_trending", name="GitHub Trending")
+    def fetch(self):
+        return self.ctx.http.get_json("https://api.example.com/trending")
+    def parse(self, raw):
+        for item in raw["items"]:
+            yield {"标题": item["name"], "star": item["stars"]}
+```
+AI 只要产出这样一个类，`harvex run` 就能跑通采集→校验→去重→入库→流水→健康检查全链路。
+新增字段不会撑爆主表（自动折叠进 extra 列），一个源挂掉不影响整轮，脏数据写库前被拦截。
+## 设计原则
+- **核心零重依赖**：core 只依赖 `pydantic` / `pydantic-settings` / `httpx` / `tenacity`。`playwright`、`openai`、Web、TUI 都是按需安装的 `extras`。
+- **字段收口是框架契约**：用 `HarvestRecord`（pydantic v2）守住「不让主表变稀疏矩阵」的纪律，未声明字段自动折叠，脏数据写库前拦截。
+- **故障隔离**：一个源挂掉不影响整轮抓取。
+- **存储先 SQLite，签名预留 Sink 抽象**：开箱即用，又留好扩展接入点。
+- **调度与 Web 解耦**：CLI + 系统 launchd/cron，不把定时寄生在 Web 进程里。
+## 分层
+```
+sources/*.py (你/AI 写)      BaseSource 子类：fetch() + parse()
+     ↓ raw → list[dict]
+core/pipeline               校验(pydantic) → 收口(extra 折叠) → 写库 → 流水 → 健康检查
+     ↓
+storage/sqlite_sink         建表/补列 + upsert 去重 + 轮级备份
+     ↓
+SQLite 业务库 + 元信息库
+     ↓ (extras)
+extras/web  只读浏览    extras/llm  翻译润色    extras/tui  控制面板
+统筹：core/runner（并发） + cli（harvex run / health / gen-launchd）
+```
+## 新项目骨架
+```
+my_project/
+├── config.toml         # 数据源开关/调度/筛选/通知
+├── .env.local          # 密钥（openai key、webhook url）
+├── fields.py           # 你的 HarvestRecord 子类 —— 标准字段
+├── sources/            # 一源一文件，只写 fetch/parse
+└── database/  logs/
+```
+完整可跑模板见仓库 [`templates/project/`](templates/project)。
+## CLI
+```bash
+harvex list                 # 列出已发现的数据源
+harvex run --all            # 跑一轮全部源
+harvex run gh_trending      # 跑指定源
+harvex health               # 数据健康检查（归零/骤降）
+harvex gen-launchd          # 生成 macOS launchd 定时配置
+harvex gen-cron             # 生成 crontab 行
+harvex web                  # 启动只读浏览 UI（需 [web]）
+harvex tui                  # 启动本地控制面板（需 [tui]）
+```
+## 开发
+```bash
+uv venv && uv pip install -e ".[dev]"
+uv run pytest
+```
+## License
+MIT © Stephen Fang

harvex-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,95 @@
+# harvex
+**AI 时代的数据采集基座** —— 为 AI agent 与 vibecoding 打造的零样板数据采集框架。
+让 AI（或你自己 vibecoding）只需写「**怎么抓、怎么解析**」这一件事，其余全部交给框架：
+并发调度、字段收口、写库去重、元数据流水、HTTP 重试、日志、告警、数据健康检查、
+定时调度、Web 浏览、LLM 翻译增强、TUI 控制面板。
+```bash
+pip install harvex          # 核心零重依赖
+pip install "harvex[web,llm,browser,tui]"   # 按需启用扩展
+```
+## 为什么是「AI 时代的采集基座」
+让 LLM 写爬虫时，模型最擅长的是「这个页面/接口怎么解析成结构化数据」，最不擅长、
+也最容易写错的是周边工程：重试退避、并发隔离、增量去重、schema 漂移、调度、可观测。
+harvex 把后者全部沉淀成稳定基座，给 AI 留下一个**极窄、极稳的契约面**：
+```python
+from harvex import BaseSource, SourceProfile
+class GithubTrending(BaseSource):
+    profile = SourceProfile(slug="gh_trending", name="GitHub Trending")
+    def fetch(self):
+        return self.ctx.http.get_json("https://api.example.com/trending")
+    def parse(self, raw):
+        for item in raw["items"]:
+            yield {"标题": item["name"], "star": item["stars"]}
+```
+AI 只要产出这样一个类，`harvex run` 就能跑通采集→校验→去重→入库→流水→健康检查全链路。
+新增字段不会撑爆主表（自动折叠进 extra 列），一个源挂掉不影响整轮，脏数据写库前被拦截。
+## 设计原则
+- **核心零重依赖**：core 只依赖 `pydantic` / `pydantic-settings` / `httpx` / `tenacity`。`playwright`、`openai`、Web、TUI 都是按需安装的 `extras`。
+- **字段收口是框架契约**：用 `HarvestRecord`（pydantic v2）守住「不让主表变稀疏矩阵」的纪律，未声明字段自动折叠，脏数据写库前拦截。
+- **故障隔离**：一个源挂掉不影响整轮抓取。
+- **存储先 SQLite，签名预留 Sink 抽象**：开箱即用，又留好扩展接入点。
+- **调度与 Web 解耦**：CLI + 系统 launchd/cron，不把定时寄生在 Web 进程里。
+## 分层
+```
+sources/*.py (你/AI 写)      BaseSource 子类：fetch() + parse()
+     ↓ raw → list[dict]
+core/pipeline               校验(pydantic) → 收口(extra 折叠) → 写库 → 流水 → 健康检查
+     ↓
+storage/sqlite_sink         建表/补列 + upsert 去重 + 轮级备份
+     ↓
+SQLite 业务库 + 元信息库
+     ↓ (extras)
+extras/web  只读浏览    extras/llm  翻译润色    extras/tui  控制面板
+统筹：core/runner（并发） + cli（harvex run / health / gen-launchd）
+```
+## 新项目骨架
+```
+my_project/
+├── config.toml         # 数据源开关/调度/筛选/通知
+├── .env.local          # 密钥（openai key、webhook url）
+├── fields.py           # 你的 HarvestRecord 子类 —— 标准字段
+├── sources/            # 一源一文件，只写 fetch/parse
+└── database/  logs/
+```
+完整可跑模板见仓库 [`templates/project/`](templates/project)。
+## CLI
+```bash
+harvex list                 # 列出已发现的数据源
+harvex run --all            # 跑一轮全部源
+harvex run gh_trending      # 跑指定源
+harvex health               # 数据健康检查（归零/骤降）
+harvex gen-launchd          # 生成 macOS launchd 定时配置
+harvex gen-cron             # 生成 crontab 行
+harvex web                  # 启动只读浏览 UI（需 [web]）
+harvex tui                  # 启动本地控制面板（需 [tui]）
+```
+## 开发
+```bash
+uv venv && uv pip install -e ".[dev]"
+uv run pytest
+```
+## License
+MIT © Stephen Fang

harvex-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,59 @@
+[project]
+name = "harvex"
+version = "0.1.0"
+description = "AI 时代的数据采集基座 —— 为 AI agent 与 vibecoding 打造的零样板数据采集框架"
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "MIT" }
+authors = [{ name = "Stephen Fang", email = "qazwsx80@gmail.com" }]
+keywords = [
+    "data-harvesting", "scraping", "crawler", "ai-agent", "vibecoding",
+    "sqlite", "pipeline", "etl", "framework", "llm",
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Libraries :: Application Frameworks",
+    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+    "Topic :: Database",
+]
+# 核心零重依赖：只保留 pydantic / pydantic-settings / httpx / tenacity 这几个轻量依赖
+dependencies = [
+    "pydantic>=2.6",
+    "pydantic-settings>=2.2",
+    "httpx>=0.27",
+    "tenacity>=8.2",
+]
+[project.optional-dependencies]
+# 可选扩展：按需安装，核心永不强拉这些重依赖
+browser = ["playwright>=1.42"]
+llm = ["openai>=1.30"]
+web = []          # 只读浏览 UI 走 stdlib http.server，无第三方依赖
+tui = []          # 本地控制面板走 stdlib，无第三方依赖
+dev = ["pytest>=8.0", "pytest-cov>=5.0"]
+[project.urls]
+Homepage = "https://github.com/fangxuanxin/harvex"
+Repository = "https://github.com/fangxuanxin/harvex"
+Issues = "https://github.com/fangxuanxin/harvex/issues"
+Changelog = "https://github.com/fangxuanxin/harvex/blob/main/CHANGELOG.md"
+[project.scripts]
+# CLI 入口：harvex list / run / health / gen-launchd / gen-cron / web / tui
+harvex = "harvex.cli.main:main"
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["src/harvex"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-q"

harvex-0.1.0/src/harvex/__init__.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""harvex：可本地复用的数据采集框架库。
+公开 API —— 下游项目只需从这里导入：
+    from harvex import BaseSource, SourceProfile, HarvestRecord, run_sources
+核心契约（record/source/context/errors/sink）始终可用；
+runner/pipeline 等编排能力在对应模块就绪后由本文件统一导出。
+"""
+from __future__ import annotations
+from .core.context import SourceContext
+from .core.errors import (
+    ConfigError,
+    FetchError,
+    HarvexError,
+    ParseError,
+    RecordValidationError,
+    SinkError,
+)
+from .core.pipeline import Pipeline, SourceResult
+from .core.record import HarvestRecord
+from .core.registry import SourceRegistry
+from .core.runner import RunReport, run_sources
+from .core.source import BaseSource, SourceProfile
+from .storage.sink import Sink, WriteResult
+__version__ = "0.1.0"
+__all__ = [
+    "BaseSource",
+    "SourceProfile",
+    "SourceContext",
+    "HarvestRecord",
+    "SourceRegistry",
+    "Pipeline",
+    "SourceResult",
+    "run_sources",
+    "RunReport",
+    "Sink",
+    "WriteResult",
+    "HarvexError",
+    "ConfigError",
+    "FetchError",
+    "ParseError",
+    "RecordValidationError",
+    "SinkError",
+    "__version__",
+]

harvex-0.1.0/src/harvex/cli/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """harvex 子模块。"""

harvex-0.1.0/src/harvex/cli/main.py ADDED Viewed

@@ -0,0 +1,238 @@
+"""harvex 命令行入口。
+把配置、源发现、各层组装成可运行的应用，并暴露子命令：
+    harvex list                 列出已发现的数据源
+    harvex run --all            跑一轮全部启用的源
+    harvex run icbc apple       跑指定源
+    harvex health               数据健康检查（归零/骤降）
+    harvex gen-launchd          生成 macOS launchd plist
+    harvex gen-cron             生成 crontab 行
+    harvex web                  启动只读浏览 UI（需 [web]）
+    harvex tui                  启动本地控制面板（需 [tui]）
+约定：在下游项目根目录（含 config.toml / sources/）运行，或用 --project 指定。
+"""
+from __future__ import annotations
+import argparse
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from ..config.settings import Settings, load_settings
+from ..core.pipeline import Pipeline
+from ..core.record import HarvestRecord
+from ..core.registry import SourceRegistry
+from ..core.runner import run_sources
+from ..meta.health import scan_health
+from ..meta.metadata_db import MetadataDatabase
+from ..net.http_client import HttpClient
+from ..net.retry import RetryPolicy
+from ..notify.webhook import WebhookNotifier
+from ..obs.logging import get_logger, setup_logging
+from ..storage.sqlite_sink import SQLiteSink
@dataclass
class App:
    """Assembled runtime application: settings, source registry, pipeline and its resources.

    The ``sink``, ``meta`` and ``http`` fields are the closable resources that
    :meth:`close` shuts down.
    """

    settings: Settings
    registry: SourceRegistry
    pipeline: Pipeline
    sink: SQLiteSink
    meta: MetadataDatabase
    http: HttpClient

    def close(self) -> None:
        """Close the HTTP client, the metadata database and the sink, in that order.

        Shutdown is best-effort: a failure while closing one resource is never
        propagated and never prevents the remaining resources from being
        closed. Failures are logged at debug level (with traceback) instead of
        being dropped silently, so a broken shutdown can still be diagnosed.
        """
        for resource in (self.http, self.meta, self.sink):
            try:
                resource.close()
            except Exception:  # noqa: BLE001 - shutdown must not raise
                get_logger().debug(
                    "关闭 %s 失败（已忽略）", type(resource).__name__, exc_info=True
                )
def _resolve_record_model(registry: SourceRegistry) -> type[HarvestRecord]:
    """Return the record model used for the business table.

    All sources of a project are expected to share one model. With no
    registered sources the base ``HarvestRecord`` is returned. When the
    sources disagree, a warning listing the model names (sorted) is logged and
    the model of the first registered source wins.
    """
    distinct_models = {source_cls.record_model for source_cls in registry.all()}
    if len(distinct_models) == 0:
        return HarvestRecord
    if len(distinct_models) >= 2:
        model_names = sorted(model.__name__ for model in distinct_models)
        get_logger().warning("发现多个记录模型，使用首个：%s", model_names)
    first_source = registry.all()[0]
    return first_source.record_model
def build_app(project_dir: str | Path) -> App:
    """Assemble an :class:`App` from a project directory.

    Steps, in order: resolve the directory, load settings, set up logging,
    discover sources under ``<project>/sources`` when that directory exists,
    pick the record model, then build the HTTP client, SQLite sink, metadata
    database, webhook notifier and finally the pipeline.

    Args:
        project_dir: Project root; resolved to an absolute path before use.

    Returns:
        The fully wired application. The caller is responsible for calling
        ``close()`` on it.
    """
    root = Path(project_dir).resolve()
    settings = load_settings(root)
    setup_logging(settings.log_dir)

    registry = SourceRegistry()
    sources_root = root / "sources"
    if sources_root.is_dir():
        registry.discover_dir(sources_root)
    record_model = _resolve_record_model(registry)

    http_cfg = settings.http
    http = HttpClient(
        timeout=http_cfg.timeout,
        user_agent=http_cfg.user_agent,
        retry=RetryPolicy(attempts=http_cfg.retry_attempts),
    )

    # The round-level backup is triggered by the runner, so the sink's own
    # per-write backup is switched off here.
    storage_cfg = settings.storage
    sink = SQLiteSink(
        storage_cfg.db_path,
        record_model,
        table=storage_cfg.table,
        backup=False,
        backup_keep=storage_cfg.backup_keep,
    )
    meta = MetadataDatabase(storage_cfg.meta_db_path)

    notifier = WebhookNotifier.from_config(
        {"url": settings.notify.url, "kind": settings.notify.kind}
    )

    # Per-source options, with the schedule/channel from config layered on top.
    source_config = {}
    for slug, source_cfg in settings.sources.items():
        source_config[slug] = dict(
            source_cfg.options,
            schedule=source_cfg.schedule,
            channel=source_cfg.channel,
        )

    pipeline = Pipeline(
        sink=sink,
        meta=meta,
        http=http,
        drop_ratio=settings.drop_ratio,
        notifier=notifier,
        source_config=source_config,
    )
    return App(
        settings=settings,
        registry=registry,
        pipeline=pipeline,
        sink=sink,
        meta=meta,
        http=http,
    )
def _select_sources(app: App, slugs: list[str], run_all: bool):
    """Choose the source classes to run from the command-line arguments.

    When ``run_all`` is set or no slugs are given, every source returned by
    ``app.registry.enabled()`` is selected, except those whose config entry
    has ``enabled`` set to a falsy value. Otherwise each slug is looked up
    with ``app.registry.get`` and returned in the order given.
    """
    sources_cfg = app.settings.sources
    explicit_only = bool(slugs) and not run_all
    if explicit_only:
        return [app.registry.get(slug) for slug in slugs]

    selected = []
    for source_cls in app.registry.enabled():
        override = sources_cfg.get(source_cls.profile.slug)
        # No config entry means "no opinion"; only an explicit disable skips it.
        if override is None or override.enabled:
            selected.append(source_cls)
    return selected
+# ---------------- 子命令 ----------------
def cmd_list(app: App, args) -> int:
    """Print every discovered source with its enabled flag, channel and schedule.

    The enabled flag comes from the source's config entry when one exists,
    otherwise from the source profile. Always returns exit code 0.
    """
    if not app.registry.all():
        print("（未发现任何数据源；确认 sources/ 目录存在且含 BaseSource 子类）")
        return 0

    print(f"已发现 {len(app.registry.all())} 个数据源：")
    for source_cls in app.registry.all():
        profile = source_cls.profile
        override = app.settings.sources.get(profile.slug)
        # NOTE(review): a config entry overrides profile.enabled in both
        # directions here; confirm `harvex run --all` applies the same rule.
        is_enabled = override.enabled if override else profile.enabled
        marker = "✓" if is_enabled else "✗"
        channel = profile.channel or "-"
        schedule = profile.schedule or "-"
        print(f"  [{marker}] {profile.slug:<16} {profile.name}  (channel={channel}, schedule={schedule})")
    return 0
def cmd_run(app: App, args) -> int:
    """Run one harvesting round for the selected sources and print a report.

    Returns:
        1 when no source was selected, 2 when the report's ``failed`` value is
        truthy, 0 otherwise.
    """
    selected = _select_sources(app, args.slugs, args.all)
    if not selected:
        print("没有可运行的源。")
        return 1

    report = run_sources(selected, app.pipeline, max_workers=args.workers)
    print("\n" + report.summary_line())

    for result in report.results:
        mark = {"success": "✓", "anomaly": "!", "failed": "✗"}.get(result.status, "?")
        # Prefer the error text; fall back to the health-check reason, if any.
        if result.error:
            detail = result.error
        elif result.health:
            detail = result.health.reason
        else:
            detail = ""
        print(
            f"  [{mark}] {result.slug:<16} 收到{result.received} 新增{result.inserted} "
            f"更新{result.updated} 总{result.total_after} {detail}"
        )
    return 2 if report.failed else 0
def cmd_health(app: App, args) -> int:
    """Print the health status of each source reported by ``scan_health``.

    Returns:
        0 when there is no history or no status is ``"anomaly"``, 2 otherwise.
    """
    statuses = scan_health(app.meta, drop_ratio=app.settings.drop_ratio)
    if not statuses:
        print("（暂无抓取历史可供体检）")
        return 0

    anomaly_count = sum(1 for entry in statuses if entry.status == "anomaly")
    for entry in statuses:
        mark = "✓"
        if entry.status == "anomaly":
            mark = "!"
        print(f"  [{mark}] {entry.source_slug:<16} 当前{entry.current} 上轮{entry.previous} {entry.reason}")
    return 2 if anomaly_count else 0
def cmd_gen_launchd(app: App, args) -> int:
    """Print a launchd plist to stdout and an install hint to stderr.

    ``--at`` is split on commas into trigger times (default ``10:00`` and
    ``17:00``); ``--label`` defaults to ``com.harvex``. Always returns 0.
    """
    from ..scheduling.launchd import generate_launchd_plist, install_hint

    if args.at:
        times = args.at.split(",")
    else:
        times = ["10:00", "17:00"]
    label = args.label if args.label else "com.harvex"

    plist = generate_launchd_plist(
        label=label,
        project_dir=app.settings.project_dir,
        times=times,
    )
    print(plist)
    # The hint goes to stderr so that stdout carries only the plist.
    print("\n" + install_hint(label), file=sys.stderr)
    return 0
def cmd_gen_cron(app: App, args) -> int:
    """Print the crontab line for this project; always returns 0.

    ``--at`` is split on commas into trigger times (default ``10:00`` and
    ``17:00``).
    """
    from ..scheduling.cron import generate_cron_line

    if args.at:
        times = args.at.split(",")
    else:
        times = ["10:00", "17:00"]
    cron_line = generate_cron_line(project_dir=app.settings.project_dir, times=times)
    print(cron_line)
    return 0
def cmd_web(app: App, args) -> int:
    """Start the web UI on ``args.host`` / ``args.port``.

    Returns:
        1 (after printing the reason to stderr) when the web extension cannot
        be imported, 0 once ``serve`` returns.
    """
    try:
        from ..extras.web.service import serve
    except ImportError as exc:
        print(f"Web 扩展不可用：{exc}", file=sys.stderr)
        return 1
    else:
        serve(app, host=args.host, port=args.port)
        return 0
def cmd_tui(app: App, args) -> int:
    """Start the terminal control panel.

    Returns:
        1 (after printing the reason to stderr) when the TUI extension cannot
        be imported, 0 once ``run_panel`` returns.
    """
    try:
        from ..extras.tui.panel import run_panel
    except ImportError as exc:
        print(f"TUI 扩展不可用：{exc}", file=sys.stderr)
        return 1
    else:
        run_panel(app)
        return 0
def build_parser() -> argparse.ArgumentParser:
    """Build the ``harvex`` argument parser.

    A sub-command is required and stored in ``command``. The global
    ``--project`` option (default ``"."``) selects the project directory.
    """
    root = argparse.ArgumentParser(prog="harvex", description="本地数据采集框架 CLI")
    root.add_argument("--project", default=".", help="项目目录（默认当前目录）")
    commands = root.add_subparsers(dest="command", required=True)

    # list
    commands.add_parser("list", help="列出数据源")

    # run
    run_cmd = commands.add_parser("run", help="跑一轮抓取")
    run_cmd.add_argument("slugs", nargs="*", help="指定源 slug；省略则跑全部")
    run_cmd.add_argument("--all", action="store_true", help="跑全部启用的源")
    run_cmd.add_argument("--workers", type=int, default=4, help="并发数")

    # health
    commands.add_parser("health", help="数据健康检查")

    # gen-launchd
    launchd_cmd = commands.add_parser("gen-launchd", help="生成 launchd plist")
    launchd_cmd.add_argument("--at", help="触发时间点，逗号分隔，如 10:00,17:00")
    launchd_cmd.add_argument("--label", help="launchd Label")

    # gen-cron
    cron_cmd = commands.add_parser("gen-cron", help="生成 crontab 行")
    cron_cmd.add_argument("--at", help="触发时间点，逗号分隔")

    # web
    web_cmd = commands.add_parser("web", help="启动只读浏览 UI")
    web_cmd.add_argument("--host", default="127.0.0.1")
    web_cmd.add_argument("--port", type=int, default=8765)

    # tui
    commands.add_parser("tui", help="启动本地控制面板")
    return root
# Sub-command name -> handler. Each handler is called as ``handler(app, args)``
# and returns the process exit code.
_HANDLERS = {
    "list": cmd_list,
    "run": cmd_run,
    "health": cmd_health,
    "gen-launchd": cmd_gen_launchd,
    "gen-cron": cmd_gen_cron,
    "web": cmd_web,
    "tui": cmd_tui,
}
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv``, build the application and dispatch to the sub-command.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code returned by the sub-command handler. The application is
        closed whether the handler returns or raises.
    """
    namespace = build_parser().parse_args(argv)
    application = build_app(namespace.project)
    try:
        handler = _HANDLERS[namespace.command]
        exit_code = handler(application, namespace)
    finally:
        application.close()
    return exit_code
+if __name__ == "__main__":
+    raise SystemExit(main())