harvex 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. harvex-0.1.0/.gitignore +25 -0
  2. harvex-0.1.0/CHANGELOG.md +39 -0
  3. harvex-0.1.0/LICENSE +21 -0
  4. harvex-0.1.0/PKG-INFO +132 -0
  5. harvex-0.1.0/README.md +95 -0
  6. harvex-0.1.0/pyproject.toml +59 -0
  7. harvex-0.1.0/src/harvex/__init__.py +50 -0
  8. harvex-0.1.0/src/harvex/cli/__init__.py +1 -0
  9. harvex-0.1.0/src/harvex/cli/main.py +238 -0
  10. harvex-0.1.0/src/harvex/config/__init__.py +51 -0
  11. harvex-0.1.0/src/harvex/config/defaults.py +54 -0
  12. harvex-0.1.0/src/harvex/config/settings.py +355 -0
  13. harvex-0.1.0/src/harvex/core/__init__.py +1 -0
  14. harvex-0.1.0/src/harvex/core/context.py +41 -0
  15. harvex-0.1.0/src/harvex/core/errors.py +49 -0
  16. harvex-0.1.0/src/harvex/core/pipeline.py +136 -0
  17. harvex-0.1.0/src/harvex/core/record.py +86 -0
  18. harvex-0.1.0/src/harvex/core/registry.py +97 -0
  19. harvex-0.1.0/src/harvex/core/runner.py +95 -0
  20. harvex-0.1.0/src/harvex/core/source.py +79 -0
  21. harvex-0.1.0/src/harvex/extras/__init__.py +1 -0
  22. harvex-0.1.0/src/harvex/extras/browser/__init__.py +14 -0
  23. harvex-0.1.0/src/harvex/extras/browser/browser_source.py +134 -0
  24. harvex-0.1.0/src/harvex/extras/browser/runtime.py +129 -0
  25. harvex-0.1.0/src/harvex/extras/llm/__init__.py +19 -0
  26. harvex-0.1.0/src/harvex/extras/llm/cache.py +132 -0
  27. harvex-0.1.0/src/harvex/extras/llm/enrich.py +89 -0
  28. harvex-0.1.0/src/harvex/extras/llm/schema.py +112 -0
  29. harvex-0.1.0/src/harvex/extras/llm/translator.py +215 -0
  30. harvex-0.1.0/src/harvex/extras/tui/__init__.py +1 -0
  31. harvex-0.1.0/src/harvex/extras/tui/panel.py +300 -0
  32. harvex-0.1.0/src/harvex/extras/tui/textwidth.py +111 -0
  33. harvex-0.1.0/src/harvex/extras/web/__init__.py +1 -0
  34. harvex-0.1.0/src/harvex/extras/web/assets/app.js +155 -0
  35. harvex-0.1.0/src/harvex/extras/web/assets/index.html +64 -0
  36. harvex-0.1.0/src/harvex/extras/web/assets/styles.css +116 -0
  37. harvex-0.1.0/src/harvex/extras/web/queries.py +138 -0
  38. harvex-0.1.0/src/harvex/extras/web/service.py +229 -0
  39. harvex-0.1.0/src/harvex/meta/__init__.py +18 -0
  40. harvex-0.1.0/src/harvex/meta/health.py +138 -0
  41. harvex-0.1.0/src/harvex/meta/metadata_db.py +297 -0
  42. harvex-0.1.0/src/harvex/net/__init__.py +1 -0
  43. harvex-0.1.0/src/harvex/net/http_client.py +197 -0
  44. harvex-0.1.0/src/harvex/net/retry.py +101 -0
  45. harvex-0.1.0/src/harvex/notify/__init__.py +12 -0
  46. harvex-0.1.0/src/harvex/notify/notifier.py +65 -0
  47. harvex-0.1.0/src/harvex/notify/webhook.py +200 -0
  48. harvex-0.1.0/src/harvex/obs/__init__.py +10 -0
  49. harvex-0.1.0/src/harvex/obs/logging.py +135 -0
  50. harvex-0.1.0/src/harvex/py.typed +0 -0
  51. harvex-0.1.0/src/harvex/scheduling/__init__.py +1 -0
  52. harvex-0.1.0/src/harvex/scheduling/cron.py +37 -0
  53. harvex-0.1.0/src/harvex/scheduling/launchd.py +87 -0
  54. harvex-0.1.0/src/harvex/storage/__init__.py +17 -0
  55. harvex-0.1.0/src/harvex/storage/database.py +326 -0
  56. harvex-0.1.0/src/harvex/storage/schema.py +89 -0
  57. harvex-0.1.0/src/harvex/storage/sink.py +38 -0
  58. harvex-0.1.0/src/harvex/storage/sqlite_sink.py +166 -0
  59. harvex-0.1.0/templates/project/.env.local.example +4 -0
  60. harvex-0.1.0/templates/project/config.toml +21 -0
  61. harvex-0.1.0/templates/project/fields.py +23 -0
  62. harvex-0.1.0/templates/project/pyproject.toml +18 -0
  63. harvex-0.1.0/templates/project/sources/__init__.py +1 -0
  64. harvex-0.1.0/templates/project/sources/example_source.py +39 -0
  65. harvex-0.1.0/tests/config/__init__.py +1 -0
  66. harvex-0.1.0/tests/config/test_settings.py +57 -0
  67. harvex-0.1.0/tests/conftest.py +65 -0
  68. harvex-0.1.0/tests/core/__init__.py +1 -0
  69. harvex-0.1.0/tests/core/test_pipeline_runner.py +90 -0
  70. harvex-0.1.0/tests/core/test_record.py +184 -0
  71. harvex-0.1.0/tests/core/test_registry.py +63 -0
  72. harvex-0.1.0/tests/extras/__init__.py +0 -0
  73. harvex-0.1.0/tests/extras/test_extras.py +64 -0
  74. harvex-0.1.0/tests/meta/__init__.py +1 -0
  75. harvex-0.1.0/tests/meta/test_metadata_health.py +55 -0
  76. harvex-0.1.0/tests/net/__init__.py +1 -0
  77. harvex-0.1.0/tests/net/test_http_client.py +70 -0
  78. harvex-0.1.0/tests/notify/__init__.py +1 -0
  79. harvex-0.1.0/tests/notify/test_webhook.py +44 -0
  80. harvex-0.1.0/tests/storage/__init__.py +1 -0
  81. harvex-0.1.0/tests/storage/test_sqlite_sink.py +46 -0
  82. harvex-0.1.0/uv.lock +751 -0
@@ -0,0 +1,25 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .venv/
6
+ .pytest_cache/
7
+ .coverage
8
+ htmlcov/
9
+ build/
10
+ dist/
11
+
12
+ # 本地数据与日志(框架自身仓库不存放运行产物)
13
+ logs/*.log
14
+ *.db
15
+ *.db-wal
16
+ *.db-shm
17
+ database_backup/
18
+
19
+ # 密钥
20
+ .env.local
21
+ .env
22
+
23
+ # 系统
24
+ .DS_Store
25
+ .playwright-mcp/
@@ -0,0 +1,39 @@
1
+ # Changelog
2
+
3
+ 本项目遵循 [语义化版本](https://semver.org/lang/zh-CN/)。
4
+
5
+ ## [0.1.0] - 2026-06-24
6
+
7
+ 首个公开发布。AI 时代的数据采集基座,为 AI agent 与 vibecoding 设计。
8
+
9
+ ### 核心
10
+
11
+ - `HarvestRecord`(pydantic v2)字段收口契约:未声明字段自动折叠进 extra 列,脏数据写库前拦截。
12
+ - `BaseSource` + `SourceProfile` 类契约,`SourceRegistry` 扫描 `sources/` 自动发现。
13
+ - `Pipeline` 两阶段编排(并行采集 / 主线程串行落地,规避 SQLite 跨线程问题)。
14
+ - `run_sources` 并发统筹:故障隔离、轮级备份、汇总、失败/异常告警。
15
+
16
+ ### 存储 / 元数据 / 韧性
17
+
18
+ - SQLite Sink:建表/自动补列 + upsert 去重 + 轮级备份;`Sink` 抽象预留多出口扩展。
19
+ - 元数据流水库 + 数据健康检查(归零 / 骤降识别)。
20
+ - `httpx` 统一客户端 + `tenacity` 指数退避重试;stdlib 结构化日志;webhook 告警(Bark/飞书/Server酱)。
21
+
22
+ ### CLI / 调度
23
+
24
+ - `harvex` CLI:`list / run / health / gen-launchd / gen-cron / web / tui`。
25
+ - 生成 macOS launchd plist 与 crontab,调度与 Web 解耦。
26
+
27
+ ### 可选扩展(extras)
28
+
29
+ - `[web]`:stdlib http.server 只读浏览 UI(零第三方依赖)。
30
+ - `[llm]`:OpenAI 翻译 + 缓存 + 重试、一句话增强。
31
+ - `[tui]`:本地终端控制面板。
32
+ - `[browser]`:playwright 渲染型源基类。
33
+
34
+ ### 工程
35
+
36
+ - 42 个测试(零真实网络 / OpenAI / 浏览器)。
37
+ - `py.typed` 类型标记,核心零重依赖。
38
+
39
+ [0.1.0]: https://github.com/fangxuanxin/harvex/releases/tag/v0.1.0
harvex-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stephen Fang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
harvex-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,132 @@
1
+ Metadata-Version: 2.4
2
+ Name: harvex
3
+ Version: 0.1.0
4
+ Summary: AI 时代的数据采集基座 —— 为 AI agent 与 vibecoding 打造的零样板数据采集框架
5
+ Project-URL: Homepage, https://github.com/fangxuanxin/harvex
6
+ Project-URL: Repository, https://github.com/fangxuanxin/harvex
7
+ Project-URL: Issues, https://github.com/fangxuanxin/harvex/issues
8
+ Project-URL: Changelog, https://github.com/fangxuanxin/harvex/blob/main/CHANGELOG.md
9
+ Author-email: Stephen Fang <qazwsx80@gmail.com>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: ai-agent,crawler,data-harvesting,etl,framework,llm,pipeline,scraping,sqlite,vibecoding
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Database
20
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
21
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
22
+ Requires-Python: >=3.11
23
+ Requires-Dist: httpx>=0.27
24
+ Requires-Dist: pydantic-settings>=2.2
25
+ Requires-Dist: pydantic>=2.6
26
+ Requires-Dist: tenacity>=8.2
27
+ Provides-Extra: browser
28
+ Requires-Dist: playwright>=1.42; extra == 'browser'
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
31
+ Requires-Dist: pytest>=8.0; extra == 'dev'
32
+ Provides-Extra: llm
33
+ Requires-Dist: openai>=1.30; extra == 'llm'
34
+ Provides-Extra: tui
35
+ Provides-Extra: web
36
+ Description-Content-Type: text/markdown
37
+
38
+ # harvex
39
+
40
+ **AI 时代的数据采集基座** —— 为 AI agent 与 vibecoding 打造的零样板数据采集框架。
41
+
42
+ 让 AI(或你自己 vibecoding)只需写「**怎么抓、怎么解析**」这一件事,其余全部交给框架:
43
+ 并发调度、字段收口、写库去重、元数据流水、HTTP 重试、日志、告警、数据健康检查、
44
+ 定时调度、Web 浏览、LLM 翻译增强、TUI 控制面板。
45
+
46
+ ```bash
47
+ pip install harvex # 核心零重依赖
48
+ pip install "harvex[web,llm,browser,tui]" # 按需启用扩展
49
+ ```
50
+
51
+ ## 为什么是「AI 时代的采集基座」
52
+
53
+ 让 LLM 写爬虫时,模型最擅长的是「这个页面/接口怎么解析成结构化数据」,最不擅长、
54
+ 也最容易写错的是周边工程:重试退避、并发隔离、增量去重、schema 漂移、调度、可观测。
55
+ harvex 把后者全部沉淀成稳定基座,给 AI 留下一个**极窄、极稳的契约面**:
56
+
57
+ ```python
58
+ from harvex import BaseSource, SourceProfile
59
+
60
+ class GithubTrending(BaseSource):
61
+ profile = SourceProfile(slug="gh_trending", name="GitHub Trending")
62
+
63
+ def fetch(self):
64
+ return self.ctx.http.get_json("https://api.example.com/trending")
65
+
66
+ def parse(self, raw):
67
+ for item in raw["items"]:
68
+ yield {"标题": item["name"], "star": item["stars"]}
69
+ ```
70
+
71
+ AI 只要产出这样一个类,`harvex run` 就能跑通采集→校验→去重→入库→流水→健康检查全链路。
72
+ 新增字段不会撑爆主表(自动折叠进 extra 列),一个源挂掉不影响整轮,脏数据写库前被拦截。
73
+
74
+ ## 设计原则
75
+
76
+ - **核心零重依赖**:core 只依赖 `pydantic`(含 `pydantic-settings`)/ `httpx` / `tenacity`。`playwright`、`openai`、Web、TUI 都是按需安装的 `extras`。
77
+ - **字段收口是框架契约**:用 `HarvestRecord`(pydantic v2)守住「不让主表变稀疏矩阵」的纪律,未声明字段自动折叠,脏数据写库前拦截。
78
+ - **故障隔离**:一个源挂掉不影响整轮抓取。
79
+ - **存储先 SQLite,签名预留 Sink 抽象**:开箱即用,又留好扩展接入点。
80
+ - **调度与 Web 解耦**:CLI + 系统 launchd/cron,不把定时寄生在 Web 进程里。
81
+
82
+ ## 分层
83
+
84
+ ```
85
+ sources/*.py (你/AI 写) BaseSource 子类:fetch() + parse()
86
+ ↓ raw → list[dict]
87
+ core/pipeline 校验(pydantic) → 收口(extra 折叠) → 写库 → 流水 → 健康检查
88
+
89
+ storage/sqlite_sink 建表/补列 + upsert 去重 + 轮级备份
90
+
91
+ SQLite 业务库 + 元信息库
92
+ ↓ (extras)
93
+ extras/web 只读浏览 extras/llm 翻译润色 extras/tui 控制面板
94
+ 统筹:core/runner(并发) + cli(harvex run / health / gen-launchd)
95
+ ```
96
+
97
+ ## 新项目骨架
98
+
99
+ ```
100
+ my_project/
101
+ ├── config.toml # 数据源开关/调度/筛选/通知
102
+ ├── .env.local # 密钥(openai key、webhook url)
103
+ ├── fields.py # 你的 HarvestRecord 子类 —— 标准字段
104
+ ├── sources/ # 一源一文件,只写 fetch/parse
105
+ └── database/ logs/
106
+ ```
107
+
108
+ 完整可跑模板见仓库 [`templates/project/`](templates/project)。
109
+
110
+ ## CLI
111
+
112
+ ```bash
113
+ harvex list # 列出已发现的数据源
114
+ harvex run --all # 跑一轮全部源
115
+ harvex run gh_trending # 跑指定源
116
+ harvex health # 数据健康检查(归零/骤降)
117
+ harvex gen-launchd # 生成 macOS launchd 定时配置
118
+ harvex gen-cron # 生成 crontab 行
119
+ harvex web # 启动只读浏览 UI(需 [web])
120
+ harvex tui # 启动本地控制面板(需 [tui])
121
+ ```
122
+
123
+ ## 开发
124
+
125
+ ```bash
126
+ uv venv && uv pip install -e ".[dev]"
127
+ uv run pytest
128
+ ```
129
+
130
+ ## License
131
+
132
+ MIT © Stephen Fang
harvex-0.1.0/README.md ADDED
@@ -0,0 +1,95 @@
1
+ # harvex
2
+
3
+ **AI 时代的数据采集基座** —— 为 AI agent 与 vibecoding 打造的零样板数据采集框架。
4
+
5
+ 让 AI(或你自己 vibecoding)只需写「**怎么抓、怎么解析**」这一件事,其余全部交给框架:
6
+ 并发调度、字段收口、写库去重、元数据流水、HTTP 重试、日志、告警、数据健康检查、
7
+ 定时调度、Web 浏览、LLM 翻译增强、TUI 控制面板。
8
+
9
+ ```bash
10
+ pip install harvex # 核心零重依赖
11
+ pip install "harvex[web,llm,browser,tui]" # 按需启用扩展
12
+ ```
13
+
14
+ ## 为什么是「AI 时代的采集基座」
15
+
16
+ 让 LLM 写爬虫时,模型最擅长的是「这个页面/接口怎么解析成结构化数据」,最不擅长、
17
+ 也最容易写错的是周边工程:重试退避、并发隔离、增量去重、schema 漂移、调度、可观测。
18
+ harvex 把后者全部沉淀成稳定基座,给 AI 留下一个**极窄、极稳的契约面**:
19
+
20
+ ```python
21
+ from harvex import BaseSource, SourceProfile
22
+
23
+ class GithubTrending(BaseSource):
24
+ profile = SourceProfile(slug="gh_trending", name="GitHub Trending")
25
+
26
+ def fetch(self):
27
+ return self.ctx.http.get_json("https://api.example.com/trending")
28
+
29
+ def parse(self, raw):
30
+ for item in raw["items"]:
31
+ yield {"标题": item["name"], "star": item["stars"]}
32
+ ```
33
+
34
+ AI 只要产出这样一个类,`harvex run` 就能跑通采集→校验→去重→入库→流水→健康检查全链路。
35
+ 新增字段不会撑爆主表(自动折叠进 extra 列),一个源挂掉不影响整轮,脏数据写库前被拦截。
36
+
37
+ ## 设计原则
38
+
39
+ - **核心零重依赖**:core 只依赖 `pydantic`(含 `pydantic-settings`)/ `httpx` / `tenacity`。`playwright`、`openai`、Web、TUI 都是按需安装的 `extras`。
40
+ - **字段收口是框架契约**:用 `HarvestRecord`(pydantic v2)守住「不让主表变稀疏矩阵」的纪律,未声明字段自动折叠,脏数据写库前拦截。
41
+ - **故障隔离**:一个源挂掉不影响整轮抓取。
42
+ - **存储先 SQLite,签名预留 Sink 抽象**:开箱即用,又留好扩展接入点。
43
+ - **调度与 Web 解耦**:CLI + 系统 launchd/cron,不把定时寄生在 Web 进程里。
44
+
45
+ ## 分层
46
+
47
+ ```
48
+ sources/*.py (你/AI 写) BaseSource 子类:fetch() + parse()
49
+ ↓ raw → list[dict]
50
+ core/pipeline 校验(pydantic) → 收口(extra 折叠) → 写库 → 流水 → 健康检查
51
+
52
+ storage/sqlite_sink 建表/补列 + upsert 去重 + 轮级备份
53
+
54
+ SQLite 业务库 + 元信息库
55
+ ↓ (extras)
56
+ extras/web 只读浏览 extras/llm 翻译润色 extras/tui 控制面板
57
+ 统筹:core/runner(并发) + cli(harvex run / health / gen-launchd)
58
+ ```
59
+
60
+ ## 新项目骨架
61
+
62
+ ```
63
+ my_project/
64
+ ├── config.toml # 数据源开关/调度/筛选/通知
65
+ ├── .env.local # 密钥(openai key、webhook url)
66
+ ├── fields.py # 你的 HarvestRecord 子类 —— 标准字段
67
+ ├── sources/ # 一源一文件,只写 fetch/parse
68
+ └── database/ logs/
69
+ ```
70
+
71
+ 完整可跑模板见仓库 [`templates/project/`](templates/project)。
72
+
73
+ ## CLI
74
+
75
+ ```bash
76
+ harvex list # 列出已发现的数据源
77
+ harvex run --all # 跑一轮全部源
78
+ harvex run gh_trending # 跑指定源
79
+ harvex health # 数据健康检查(归零/骤降)
80
+ harvex gen-launchd # 生成 macOS launchd 定时配置
81
+ harvex gen-cron # 生成 crontab 行
82
+ harvex web # 启动只读浏览 UI(需 [web])
83
+ harvex tui # 启动本地控制面板(需 [tui])
84
+ ```
85
+
86
+ ## 开发
87
+
88
+ ```bash
89
+ uv venv && uv pip install -e ".[dev]"
90
+ uv run pytest
91
+ ```
92
+
93
+ ## License
94
+
95
+ MIT © Stephen Fang
@@ -0,0 +1,59 @@
1
+ [project]
2
+ name = "harvex"
3
+ version = "0.1.0"
4
+ description = "AI 时代的数据采集基座 —— 为 AI agent 与 vibecoding 打造的零样板数据采集框架"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = { text = "MIT" }
8
+ authors = [{ name = "Stephen Fang", email = "qazwsx80@gmail.com" }]
9
+ keywords = [
10
+ "data-harvesting", "scraping", "crawler", "ai-agent", "vibecoding",
11
+ "sqlite", "pipeline", "etl", "framework", "llm",
12
+ ]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Topic :: Software Development :: Libraries :: Application Frameworks",
21
+ "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
22
+ "Topic :: Database",
23
+ ]
24
+ # 核心零重依赖:只保留 pydantic(含 pydantic-settings)/ httpx / tenacity
25
+ dependencies = [
26
+ "pydantic>=2.6",
27
+ "pydantic-settings>=2.2",
28
+ "httpx>=0.27",
29
+ "tenacity>=8.2",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ # 可选扩展:按需安装,核心永不强拉这些重依赖
34
+ browser = ["playwright>=1.42"]
35
+ llm = ["openai>=1.30"]
36
+ web = [] # 只读浏览 UI 走 stdlib http.server,无第三方依赖
37
+ tui = [] # 本地控制面板走 stdlib,无第三方依赖
38
+ dev = ["pytest>=8.0", "pytest-cov>=5.0"]
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/fangxuanxin/harvex"
42
+ Repository = "https://github.com/fangxuanxin/harvex"
43
+ Issues = "https://github.com/fangxuanxin/harvex/issues"
44
+ Changelog = "https://github.com/fangxuanxin/harvex/blob/main/CHANGELOG.md"
45
+
46
+ [project.scripts]
47
+ # CLI 入口:harvex list / run / health / gen-launchd / gen-cron / web / tui
48
+ harvex = "harvex.cli.main:main"
49
+
50
+ [build-system]
51
+ requires = ["hatchling"]
52
+ build-backend = "hatchling.build"
53
+
54
+ [tool.hatch.build.targets.wheel]
55
+ packages = ["src/harvex"]
56
+
57
+ [tool.pytest.ini_options]
58
+ testpaths = ["tests"]
59
+ addopts = "-q"
@@ -0,0 +1,50 @@
1
+ """harvex:可本地复用的数据采集框架库。
2
+
3
+ 公开 API —— 下游项目只需从这里导入:
4
+
5
+ from harvex import BaseSource, SourceProfile, HarvestRecord, run_sources
6
+
7
+ 核心契约(record/source/context/errors/sink)始终可用;
8
+ runner/pipeline/registry 等编排能力同样由本文件统一导出。
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from .core.context import SourceContext
14
+ from .core.errors import (
15
+ ConfigError,
16
+ FetchError,
17
+ HarvexError,
18
+ ParseError,
19
+ RecordValidationError,
20
+ SinkError,
21
+ )
22
+ from .core.pipeline import Pipeline, SourceResult
23
+ from .core.record import HarvestRecord
24
+ from .core.registry import SourceRegistry
25
+ from .core.runner import RunReport, run_sources
26
+ from .core.source import BaseSource, SourceProfile
27
+ from .storage.sink import Sink, WriteResult
28
+
29
+ __version__ = "0.1.0"
30
+
31
+ __all__ = [
32
+ "BaseSource",
33
+ "SourceProfile",
34
+ "SourceContext",
35
+ "HarvestRecord",
36
+ "SourceRegistry",
37
+ "Pipeline",
38
+ "SourceResult",
39
+ "run_sources",
40
+ "RunReport",
41
+ "Sink",
42
+ "WriteResult",
43
+ "HarvexError",
44
+ "ConfigError",
45
+ "FetchError",
46
+ "ParseError",
47
+ "RecordValidationError",
48
+ "SinkError",
49
+ "__version__",
50
+ ]
@@ -0,0 +1 @@
1
+ """harvex 子模块。"""
@@ -0,0 +1,238 @@
1
+ """harvex 命令行入口。
2
+
3
+ 把配置、源发现、各层组装成可运行的应用,并暴露子命令:
4
+
5
+ harvex list 列出已发现的数据源
6
+ harvex run --all 跑一轮全部启用的源
7
+ harvex run icbc apple 跑指定源
8
+ harvex health 数据健康检查(归零/骤降)
9
+ harvex gen-launchd 生成 macOS launchd plist
10
+ harvex gen-cron 生成 crontab 行
11
+ harvex web 启动只读浏览 UI(需 [web])
12
+ harvex tui 启动本地控制面板(需 [tui])
13
+
14
+ 约定:在下游项目根目录(含 config.toml / sources/)运行,或用 --project 指定。
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import sys
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+
24
+ from ..config.settings import Settings, load_settings
25
+ from ..core.pipeline import Pipeline
26
+ from ..core.record import HarvestRecord
27
+ from ..core.registry import SourceRegistry
28
+ from ..core.runner import run_sources
29
+ from ..meta.health import scan_health
30
+ from ..meta.metadata_db import MetadataDatabase
31
+ from ..net.http_client import HttpClient
32
+ from ..net.retry import RetryPolicy
33
+ from ..notify.webhook import WebhookNotifier
34
+ from ..obs.logging import get_logger, setup_logging
35
+ from ..storage.sqlite_sink import SQLiteSink
36
+
37
+
38
@dataclass
class App:
    """Assembled runtime application: settings, source registry, pipeline and its resources.

    The ``sink``, ``meta`` and ``http`` members are the closable resources
    released by :meth:`close`.
    """

    settings: Settings
    registry: SourceRegistry
    pipeline: Pipeline
    sink: SQLiteSink
    meta: MetadataDatabase
    http: HttpClient

    def close(self) -> None:
        """Close ``http``, ``meta`` and ``sink`` in that order.

        Closing is best effort: an exception raised by one resource is
        ignored so the remaining resources still get closed.
        """
        resources = [self.http, self.meta, self.sink]
        for resource in resources:
            try:
                resource.close()
            except Exception:  # noqa: BLE001 - shutdown is best effort
                continue
55
+
56
+
57
def _resolve_record_model(registry: SourceRegistry) -> type[HarvestRecord]:
    """Pick the business record model from the discovered sources.

    All sources of a project are expected to share one model/table. With no
    sources the base ``HarvestRecord`` is returned. When several distinct
    models are found a warning listing their names is logged, and the model
    of the first source returned by ``registry.all()`` is used.
    """
    distinct_models = set()
    for source_cls in registry.all():
        distinct_models.add(source_cls.record_model)

    if len(distinct_models) == 0:
        return HarvestRecord

    if len(distinct_models) >= 2:
        model_names = sorted(model.__name__ for model in distinct_models)
        get_logger().warning("发现多个记录模型,使用首个:%s", model_names)

    first_source = registry.all()[0]
    return first_source.record_model
65
+
66
+
67
def build_app(project_dir: str | Path) -> App:
    """Assemble an :class:`App` from a project directory.

    Steps, in order: load settings, set up logging, discover sources under
    ``<project_dir>/sources`` (when that directory exists), build the HTTP
    client, sink, metadata database and notifier, then wire them into a
    ``Pipeline``.
    """
    root = Path(project_dir).resolve()
    settings = load_settings(root)
    setup_logging(settings.log_dir)

    registry = SourceRegistry()
    candidate_dir = root / "sources"
    if candidate_dir.is_dir():
        registry.discover_dir(candidate_dir)

    record_model = _resolve_record_model(registry)

    retry_policy = RetryPolicy(attempts=settings.http.retry_attempts)
    http = HttpClient(
        timeout=settings.http.timeout,
        user_agent=settings.http.user_agent,
        retry=retry_policy,
    )

    # Round-level backups are triggered centrally by the runner, so the
    # sink's own per-write backup is switched off here.
    sink = SQLiteSink(
        settings.storage.db_path,
        record_model,
        table=settings.storage.table,
        backup=False,
        backup_keep=settings.storage.backup_keep,
    )
    meta = MetadataDatabase(settings.storage.meta_db_path)

    notify_config = {"url": settings.notify.url, "kind": settings.notify.kind}
    notifier = WebhookNotifier.from_config(notify_config)

    # Per-source options from config, with schedule/channel layered on top.
    source_config = {}
    for slug, source_settings in settings.sources.items():
        source_config[slug] = dict(
            source_settings.options,
            schedule=source_settings.schedule,
            channel=source_settings.channel,
        )

    pipeline = Pipeline(
        sink=sink,
        meta=meta,
        http=http,
        drop_ratio=settings.drop_ratio,
        notifier=notifier,
        source_config=source_config,
    )
    return App(
        settings=settings,
        registry=registry,
        pipeline=pipeline,
        sink=sink,
        meta=meta,
        http=http,
    )
98
+
99
+
100
def _select_sources(app: App, slugs: list[str], run_all: bool):
    """Choose the source classes to run from the command-line arguments.

    Explicit ``slugs`` (without ``run_all``) are looked up one by one through
    ``app.registry.get``. Otherwise every class from
    ``app.registry.enabled()`` is taken, except those whose config entry
    explicitly sets ``enabled`` to a falsy value.
    """
    settings = app.settings

    if slugs and not run_all:
        return [app.registry.get(slug) for slug in slugs]

    def _not_disabled_in_config(source_cls) -> bool:
        entry = settings.sources.get(source_cls.profile.slug)
        # A missing config entry means "no override": keep the source.
        return entry is None or bool(entry.enabled)

    return [
        source_cls
        for source_cls in app.registry.enabled()
        if _not_disabled_in_config(source_cls)
    ]
112
+
113
+
114
+ # ---------------- 子命令 ----------------
115
+
116
def cmd_list(app: App, args) -> int:
    """Print the discovered sources with their enabled flag; always returns 0.

    The enabled flag comes from the config entry for the slug when one
    exists, otherwise from the source profile. Channel and schedule are
    printed from the profile.
    """
    discovered = app.registry.all()
    if not discovered:
        print("(未发现任何数据源;确认 sources/ 目录存在且含 BaseSource 子类)")
        return 0

    print(f"已发现 {len(discovered)} 个数据源:")
    configured = app.settings.sources
    for source_cls in discovered:
        profile = source_cls.profile
        override = configured.get(profile.slug)
        if override:
            is_enabled = override.enabled
        else:
            is_enabled = profile.enabled
        flag = "✓" if is_enabled else "✗"
        channel = profile.channel or '-'
        schedule = profile.schedule or '-'
        print(f" [{flag}] {profile.slug:<16} {profile.name} (channel={channel}, schedule={schedule})")
    return 0
128
+
129
+
130
def cmd_run(app: App, args) -> int:
    """Run one harvesting round for the selected sources and print a report.

    Returns 1 when no source is selected, 2 when the report has failures,
    and 0 otherwise.
    """
    selected = _select_sources(app, args.slugs, args.all)
    if not selected:
        print("没有可运行的源。")
        return 1

    report = run_sources(selected, app.pipeline, max_workers=args.workers)
    print("\n" + report.summary_line())

    status_marks = {"success": "✓", "anomaly": "!", "failed": "✗"}
    for result in report.results:
        mark = status_marks.get(result.status, "?")
        # Prefer the error text; fall back to the health-check reason if any.
        if result.error:
            extra = result.error
        elif result.health:
            extra = result.health.reason
        else:
            extra = ""
        print(f" [{mark}] {result.slug:<16} 收到{result.received} 新增{result.inserted} 更新{result.updated} 总{result.total_after} {extra}")

    if report.failed:
        return 2
    return 0
142
+
143
+
144
def cmd_health(app: App, args) -> int:
    """Print the health status of each source from the metadata history.

    Returns 0 when there is no history or no status is ``"anomaly"``,
    and 2 when at least one anomaly is reported.
    """
    statuses = scan_health(app.meta, drop_ratio=app.settings.drop_ratio)
    if not statuses:
        print("(暂无抓取历史可供体检)")
        return 0

    anomaly_found = False
    for entry in statuses:
        is_anomaly = entry.status == "anomaly"
        anomaly_found = anomaly_found or is_anomaly
        mark = "!" if is_anomaly else "✓"
        print(f" [{mark}] {entry.source_slug:<16} 当前{entry.current} 上轮{entry.previous} {entry.reason}")

    return 2 if anomaly_found else 0
154
+
155
+
156
# Trigger times used when ``--at`` is not given (or yields no usable entry).
_DEFAULT_TIMES = ("10:00", "17:00")


def _parse_times(at: str | None) -> list[str]:
    """Turn the raw ``--at`` value into a clean list of trigger times.

    The value is split on commas, surrounding whitespace is stripped and
    empty entries are dropped, so inputs such as ``"10:00, 17:00"`` or a
    trailing comma no longer leak blank or padded entries to the schedule
    generators. Falls back to the default times when ``at`` is missing or
    contains no usable entry. A fresh list is returned on every call.
    """
    pieces = [piece.strip() for piece in (at or "").split(",")]
    cleaned = [piece for piece in pieces if piece]
    return cleaned or list(_DEFAULT_TIMES)


def cmd_gen_launchd(app: App, args) -> int:
    """Print a macOS launchd plist to stdout and an install hint to stderr.

    Uses ``args.at`` for the trigger times and ``args.label`` for the launchd
    label (default ``"com.harvex"``). Always returns 0.
    """
    from ..scheduling.launchd import generate_launchd_plist, install_hint

    times = _parse_times(args.at)
    label = args.label or "com.harvex"
    print(generate_launchd_plist(label=label, project_dir=app.settings.project_dir, times=times))
    # The hint goes to stderr so stdout can be redirected straight into a plist file.
    print("\n" + install_hint(label), file=sys.stderr)
    return 0


def cmd_gen_cron(app: App, args) -> int:
    """Print the crontab line(s) for the project to stdout; always returns 0."""
    from ..scheduling.cron import generate_cron_line

    times = _parse_times(args.at)
    print(generate_cron_line(project_dir=app.settings.project_dir, times=times))
    return 0
170
+
171
+
172
def cmd_web(app: App, args) -> int:
    """Start the read-only web UI on ``args.host``/``args.port``.

    Returns 1 (after printing the reason to stderr) when the web extension
    cannot be imported, otherwise 0 once ``serve`` returns.
    """
    try:
        from ..extras.web.service import serve
    except ImportError as error:
        message = f"Web 扩展不可用:{error}"
        print(message, file=sys.stderr)
        return 1
    else:
        serve(app, host=args.host, port=args.port)
    return 0
180
+
181
+
182
def cmd_tui(app: App, args) -> int:
    """Start the local terminal control panel.

    Returns 1 (after printing the reason to stderr) when the TUI extension
    cannot be imported, otherwise 0 once ``run_panel`` returns.
    """
    try:
        from ..extras.tui.panel import run_panel
    except ImportError as error:
        message = f"TUI 扩展不可用:{error}"
        print(message, file=sys.stderr)
        return 1
    else:
        run_panel(app)
    return 0
190
+
191
+
192
def build_parser() -> argparse.ArgumentParser:
    """Build the ``harvex`` argument parser with all of its sub-commands.

    The chosen sub-command is stored in ``command`` and is required.
    ``--project`` is a global option that defaults to the current directory.
    """
    root = argparse.ArgumentParser(prog="harvex", description="本地数据采集框架 CLI")
    root.add_argument("--project", default=".", help="项目目录(默认当前目录)")
    commands = root.add_subparsers(dest="command", required=True)

    # list: no options of its own.
    commands.add_parser("list", help="列出数据源")

    # run: optional slugs plus selection/concurrency switches.
    run_cmd = commands.add_parser("run", help="跑一轮抓取")
    run_cmd.add_argument("slugs", nargs="*", help="指定源 slug;省略则跑全部")
    run_cmd.add_argument("--all", action="store_true", help="跑全部启用的源")
    run_cmd.add_argument("--workers", type=int, default=4, help="并发数")

    # health: no options of its own.
    commands.add_parser("health", help="数据健康检查")

    # Schedule generators.
    launchd_cmd = commands.add_parser("gen-launchd", help="生成 launchd plist")
    launchd_cmd.add_argument("--at", help="触发时间点,逗号分隔,如 10:00,17:00")
    launchd_cmd.add_argument("--label", help="launchd Label")

    cron_cmd = commands.add_parser("gen-cron", help="生成 crontab 行")
    cron_cmd.add_argument("--at", help="触发时间点,逗号分隔")

    # Optional front ends.
    web_cmd = commands.add_parser("web", help="启动只读浏览 UI")
    web_cmd.add_argument("--host", default="127.0.0.1")
    web_cmd.add_argument("--port", type=int, default=8765)

    commands.add_parser("tui", help="启动本地控制面板")

    return root
219
+
220
+
221
# Dispatch table: sub-command name (as parsed into ``args.command``) -> handler.
# Every handler takes ``(app, args)`` and returns the process exit code.
_HANDLERS = {
    "list": cmd_list,
    "run": cmd_run,
    "health": cmd_health,
    "gen-launchd": cmd_gen_launchd,
    "gen-cron": cmd_gen_cron,
    "web": cmd_web,
    "tui": cmd_tui,
}
226
+
227
+
228
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, build the app, dispatch the sub-command.

    Returns the handler's exit code. The app is closed even when the
    handler raises.
    """
    parsed = build_parser().parse_args(argv)
    app = build_app(parsed.project)
    try:
        handler = _HANDLERS[parsed.command]
        exit_code = handler(app, parsed)
    finally:
        app.close()
    return exit_code


if __name__ == "__main__":
    sys.exit(main())