PyPI - linkedin-horse - Versions diffs - 0.1.0__tar.gz - Mend

linkedin-horse 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

linkedin_horse-0.1.0/PKG-INFO +222 -0
linkedin_horse-0.1.0/README.md +195 -0
linkedin_horse-0.1.0/linkedin_horse/__init__.py +3 -0
linkedin_horse-0.1.0/linkedin_horse/browser.py +84 -0
linkedin_horse-0.1.0/linkedin_horse/cli.py +249 -0
linkedin_horse-0.1.0/linkedin_horse/cookies.py +74 -0
linkedin_horse-0.1.0/linkedin_horse/export.py +135 -0
linkedin_horse-0.1.0/linkedin_horse/extractor.py +145 -0
linkedin_horse-0.1.0/linkedin_horse/output.py +46 -0
linkedin_horse-0.1.0/linkedin_horse.egg-info/PKG-INFO +222 -0
linkedin_horse-0.1.0/linkedin_horse.egg-info/SOURCES.txt +15 -0
linkedin_horse-0.1.0/linkedin_horse.egg-info/dependency_links.txt +1 -0
linkedin_horse-0.1.0/linkedin_horse.egg-info/entry_points.txt +2 -0
linkedin_horse-0.1.0/linkedin_horse.egg-info/requires.txt +9 -0
linkedin_horse-0.1.0/linkedin_horse.egg-info/top_level.txt +1 -0
linkedin_horse-0.1.0/pyproject.toml +43 -0
linkedin_horse-0.1.0/setup.cfg +4 -0

linkedin_horse-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,222 @@
+Metadata-Version: 2.4
+Name: linkedin-horse
+Version: 0.1.0
+Summary: LinkedIn 搜索结果资料提取工具 — 自动化抓取、解析、导出
+Author: linkedin-horse contributors
+License-Expression: MIT
+Keywords: linkedin,scraper,cli,typer,rich
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Utilities
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: typer>=0.9.0
+Requires-Dist: rich>=13.0.0
+Requires-Dist: beautifulsoup4>=4.12.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: openpyxl>=3.1.0
+Requires-Dist: python-dotenv>=1.0.0
+Requires-Dist: linkedin-cat
+Requires-Dist: llmdog
+Requires-Dist: larkfunc
+# linkedin-horse
+LinkedIn 搜索结果资料自动提取工具 — 基于 Selenium 自动化浏览器，从 LinkedIn 搜索页批量抓取个人资料数据，支持分页采集、自动去重、JSON 中间存储和 Excel 最终导出。
+## 功能特性
+- **自动化采集**：基于 Selenium 驱动 Chrome 浏览器，模拟真实用户操作
+- **智能解析**：多策略 HTML 解析，兼容 LinkedIn 新旧版页面结构
+- **分页抓取**：支持指定起止页码，逐页自动翻页采集
+- **自动去重**：基于 `profile_url` 自动过滤重复记录，支持增量采集
+- **双格式存储**：每页数据实时保存为 JSON 文件，最终合并导出 Excel
+- **重试机制**：每页默认最多尝试 3 次（可通过 `--max-retries` 调整），网络波动不丢数据
+- **现代 CLI**：基于 Typer + Rich 构建，彩色输出、进度条、配置面板
+- **Cookie 引导**：首次运行自动检测 Cookie 文件，提供详细的导出操作指引
+## 安装和环境配置
+### 安装
+```bash
+pip install linkedin-horse
+```
+### 环境要求
+- Python >= 3.9
+- Chrome 浏览器（用于 Selenium 驱动）
+- ChromeDriver（版本需与 Chrome 匹配）
+### Cookie 配置（首次使用必读）
+linkedin-horse 需要你的 LinkedIn 登录态 Cookie 来访问搜索结果。首次运行时，程序会自动检测并提示你配置。
+**操作步骤：**
+1. **安装 EditThisCookie 插件**
+   打开 Chrome 浏览器，访问 [Chrome Web Store](https://chrome.google.com/webstore)，搜索 "EditThisCookie" 并安装。
+2. **登录 LinkedIn**
+   在 Chrome 中访问 https://www.linkedin.com 并登录你的账号，确保页面正常显示首页 Feed。
+3. **导出 Cookies**
+   点击浏览器右上角的 EditThisCookie 插件图标（饼干形状），在弹出窗口中点击"导出"按钮。
+4. **保存文件**
+   新建文本文件，粘贴剪贴板内容，保存为 `linkedin_cookies.json`，放置在程序运行目录下。
+5. **验证**
+   确保文件是合法的 JSON 数组格式（以 `[` 开头，以 `]` 结尾）。
+## 使用示例
+### 基本用法
+```bash
+# 从 LinkedIn 搜索页提取数据（第 1-5 页）
+linkedin-horse extract \
+  --base-url "https://www.linkedin.com/search/results/people/?keywords=python%20developer&origin=GLOBAL_SEARCH_HEADER" \
+  --search-keyword "python_developer" \
+  --start-page 1 \
+  --end-page 5
+```
+### 完整参数示例
+```bash
+linkedin-horse extract \
+  --base-url "https://www.linkedin.com/search/results/people/?keywords=data%20engineer" \
+  --search-keyword "data_engineer" \
+  --start-page 1 \
+  --end-page 20 \
+  --headless \
+  --cookies-json ./my_cookies.json \
+  --max-retries 5 \
+  --retry-delay 10
+```
+### 输出结构
+运行后会生成以下文件结构：
+```
+./
+├── data_engineer/                    # 以 search_keyword 命名的数据目录
+│   ├── data_engineer_1.json          # 第 1 页数据
+│   ├── data_engineer_2.json          # 第 2 页数据
+│   └── ...
+└── data_engineer.xlsx                # 最终合并的 Excel 文件
+```
+### 查看帮助
+```bash
+linkedin-horse --help
+linkedin-horse extract --help
+```
+## API 接口说明
+linkedin-horse 采用模块化设计，核心模块可独立调用：
+### extractor 模块
+```python
+from linkedin_horse.extractor import extract_profile_data_from_page
+# 传入 HTML 源码，返回个人资料字典列表
+profiles = extract_profile_data_from_page(html_source)
+```
+### export 模块
+```python
+from linkedin_horse.export import save_page_json, merge_json_to_excel
+from pathlib import Path
+# 保存单页数据为 JSON
+save_page_json(profiles, Path("my_search"), "my_search", page=1)
+# 合并所有 JSON 为 Excel
+merge_json_to_excel(Path("my_search"), Path("my_search.xlsx"))
+```
+### browser 模块
+```python
+from linkedin_horse.browser import init_browser, fetch_page_with_retry, close_browser
+from pathlib import Path
+bot, driver = init_browser(Path("linkedin_cookies.json"), headless=True)
+html = fetch_page_with_retry(driver, url, page_num=1)
+close_browser(driver)
+```
+### cookies 模块
+```python
+from linkedin_horse.cookies import check_cookies
+from pathlib import Path
+# 检查 Cookie 文件，不存在则输出指引并退出
+check_cookies(Path("linkedin_cookies.json"))
+```
+## 依赖项清单
+| 依赖 | 用途 |
+|------|------|
+| `typer` | CLI 框架 |
+| `rich` | 终端美化输出 |
+| `beautifulsoup4` | HTML 解析 |
+| `pandas` | 数据处理与 Excel 导出 |
+| `openpyxl` | Excel 文件引擎 |
+| `python-dotenv` | 环境变量加载 |
+| `linkedin-cat` | LinkedIn 浏览器自动化 |
+| `llmdog` | LLM 调用封装 |
+| `larkfunc` | 通用工具函数库 |
+## 技术架构
+```
+linkedin_horse/
+├── cli.py          # Typer CLI 入口，参数解析与流程编排
+├── output.py       # Rich 统一输出模块（主题、彩色打印函数）
+├── cookies.py      # Cookie 文件检查与用户操作指引
+├── extractor.py    # HTML 解析与个人资料数据提取（核心逻辑）
+├── browser.py      # 浏览器初始化与页面获取（含重试机制）
+└── export.py       # JSON 分页存储 + Excel 合并导出
+```
+**数据流**：`搜索 URL → 逐页抓取 HTML → 解析提取 → JSON 分页保存 → Excel 合并导出`
+各模块职责单一、接口清晰，便于后续扩展（如增加新的解析策略、输出格式等）。
+## 贡献指南与许可证
+### 贡献
+1. Fork 本仓库
+2. 创建功能分支 (`git checkout -b feature/my-feature`)
+3. 提交更改 (`git commit -m 'Add my feature'`)
+4. 推送到分支 (`git push origin feature/my-feature`)
+5. 创建 Pull Request
+### 许可证
+本项目基于 MIT 许可证开源。
+### 免责声明
+- 本工具仅供学习和研究用途，使用者需自行承担使用风险
+- 使用本工具前请确保遵守 LinkedIn 的服务条款和使用政策
+- 过度频繁的自动化访问可能导致账号被限制，请合理控制采集频率
+- 开发者不对因使用本工具产生的任何后果承担责任
+- 请尊重他人隐私，合法合规地使用采集到的数据

linkedin_horse-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,195 @@
+# linkedin-horse
+LinkedIn 搜索结果资料自动提取工具 — 基于 Selenium 自动化浏览器，从 LinkedIn 搜索页批量抓取个人资料数据，支持分页采集、自动去重、JSON 中间存储和 Excel 最终导出。
+## 功能特性
+- **自动化采集**：基于 Selenium 驱动 Chrome 浏览器，模拟真实用户操作
+- **智能解析**：多策略 HTML 解析，兼容 LinkedIn 新旧版页面结构
+- **分页抓取**：支持指定起止页码，逐页自动翻页采集
+- **自动去重**：基于 `profile_url` 自动过滤重复记录，支持增量采集
+- **双格式存储**：每页数据实时保存为 JSON 文件，最终合并导出 Excel
+- **重试机制**：每页默认最多尝试 3 次（可通过 `--max-retries` 调整），网络波动不丢数据
+- **现代 CLI**：基于 Typer + Rich 构建，彩色输出、进度条、配置面板
+- **Cookie 引导**：首次运行自动检测 Cookie 文件，提供详细的导出操作指引
+## 安装和环境配置
+### 安装
+```bash
+pip install linkedin-horse
+```
+### 环境要求
+- Python >= 3.9
+- Chrome 浏览器（用于 Selenium 驱动）
+- ChromeDriver（版本需与 Chrome 匹配）
+### Cookie 配置（首次使用必读）
+linkedin-horse 需要你的 LinkedIn 登录态 Cookie 来访问搜索结果。首次运行时，程序会自动检测并提示你配置。
+**操作步骤：**
+1. **安装 EditThisCookie 插件**
+   打开 Chrome 浏览器，访问 [Chrome Web Store](https://chrome.google.com/webstore)，搜索 "EditThisCookie" 并安装。
+2. **登录 LinkedIn**
+   在 Chrome 中访问 https://www.linkedin.com 并登录你的账号，确保页面正常显示首页 Feed。
+3. **导出 Cookies**
+   点击浏览器右上角的 EditThisCookie 插件图标（饼干形状），在弹出窗口中点击"导出"按钮。
+4. **保存文件**
+   新建文本文件，粘贴剪贴板内容，保存为 `linkedin_cookies.json`，放置在程序运行目录下。
+5. **验证**
+   确保文件是合法的 JSON 数组格式（以 `[` 开头，以 `]` 结尾）。
+## 使用示例
+### 基本用法
+```bash
+# 从 LinkedIn 搜索页提取数据（第 1-5 页）
+linkedin-horse extract \
+  --base-url "https://www.linkedin.com/search/results/people/?keywords=python%20developer&origin=GLOBAL_SEARCH_HEADER" \
+  --search-keyword "python_developer" \
+  --start-page 1 \
+  --end-page 5
+```
+### 完整参数示例
+```bash
+linkedin-horse extract \
+  --base-url "https://www.linkedin.com/search/results/people/?keywords=data%20engineer" \
+  --search-keyword "data_engineer" \
+  --start-page 1 \
+  --end-page 20 \
+  --headless \
+  --cookies-json ./my_cookies.json \
+  --max-retries 5 \
+  --retry-delay 10
+```
+### 输出结构
+运行后会生成以下文件结构：
+```
+./
+├── data_engineer/                    # 以 search_keyword 命名的数据目录
+│   ├── data_engineer_1.json          # 第 1 页数据
+│   ├── data_engineer_2.json          # 第 2 页数据
+│   └── ...
+└── data_engineer.xlsx                # 最终合并的 Excel 文件
+```
+### 查看帮助
+```bash
+linkedin-horse --help
+linkedin-horse extract --help
+```
+## API 接口说明
+linkedin-horse 采用模块化设计，核心模块可独立调用：
+### extractor 模块
+```python
+from linkedin_horse.extractor import extract_profile_data_from_page
+# 传入 HTML 源码，返回个人资料字典列表
+profiles = extract_profile_data_from_page(html_source)
+```
+### export 模块
+```python
+from linkedin_horse.export import save_page_json, merge_json_to_excel
+from pathlib import Path
+# 保存单页数据为 JSON
+save_page_json(profiles, Path("my_search"), "my_search", page=1)
+# 合并所有 JSON 为 Excel
+merge_json_to_excel(Path("my_search"), Path("my_search.xlsx"))
+```
+### browser 模块
+```python
+from linkedin_horse.browser import init_browser, fetch_page_with_retry, close_browser
+from pathlib import Path
+bot, driver = init_browser(Path("linkedin_cookies.json"), headless=True)
+html = fetch_page_with_retry(driver, url, page_num=1)
+close_browser(driver)
+```
+### cookies 模块
+```python
+from linkedin_horse.cookies import check_cookies
+from pathlib import Path
+# 检查 Cookie 文件，不存在则输出指引并退出
+check_cookies(Path("linkedin_cookies.json"))
+```
+## 依赖项清单
+| 依赖 | 用途 |
+|------|------|
+| `typer` | CLI 框架 |
+| `rich` | 终端美化输出 |
+| `beautifulsoup4` | HTML 解析 |
+| `pandas` | 数据处理与 Excel 导出 |
+| `openpyxl` | Excel 文件引擎 |
+| `python-dotenv` | 环境变量加载 |
+| `linkedin-cat` | LinkedIn 浏览器自动化 |
+| `llmdog` | LLM 调用封装 |
+| `larkfunc` | 通用工具函数库 |
+## 技术架构
+```
+linkedin_horse/
+├── cli.py          # Typer CLI 入口，参数解析与流程编排
+├── output.py       # Rich 统一输出模块（主题、彩色打印函数）
+├── cookies.py      # Cookie 文件检查与用户操作指引
+├── extractor.py    # HTML 解析与个人资料数据提取（核心逻辑）
+├── browser.py      # 浏览器初始化与页面获取（含重试机制）
+└── export.py       # JSON 分页存储 + Excel 合并导出
+```
+**数据流**：`搜索 URL → 逐页抓取 HTML → 解析提取 → JSON 分页保存 → Excel 合并导出`
+各模块职责单一、接口清晰，便于后续扩展（如增加新的解析策略、输出格式等）。
+## 贡献指南与许可证
+### 贡献
+1. Fork 本仓库
+2. 创建功能分支 (`git checkout -b feature/my-feature`)
+3. 提交更改 (`git commit -m 'Add my feature'`)
+4. 推送到分支 (`git push origin feature/my-feature`)
+5. 创建 Pull Request
+### 许可证
+本项目基于 MIT 许可证开源。
+### 免责声明
+- 本工具仅供学习和研究用途，使用者需自行承担使用风险
+- 使用本工具前请确保遵守 LinkedIn 的服务条款和使用政策
+- 过度频繁的自动化访问可能导致账号被限制，请合理控制采集频率
+- 开发者不对因使用本工具产生的任何后果承担责任
+- 请尊重他人隐私，合法合规地使用采集到的数据

linkedin_horse-0.1.0/linkedin_horse/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""linkedin-horse: LinkedIn 搜索结果资料提取工具"""
+__version__ = "0.1.0"

linkedin_horse-0.1.0/linkedin_horse/browser.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""
+linkedin_horse.browser
+浏览器初始化与页面获取（含重试机制）
+"""
+import time
+from pathlib import Path
+from typing import Optional
+import typer
+from linkedin_cat.message import LinkedinMessage
+from linkedin_horse.output import console, print_error, print_info
+def init_browser(cookies_json: Path, headless: bool = False):
+    """
+    初始化 LinkedIn 浏览器实例。
+    Args:
+        cookies_json: cookies JSON 文件路径
+        headless: 是否使用无头模式
+    Returns:
+        (bot, driver) 元组
+    Raises:
+        typer.Exit: 初始化失败时终止程序
+    """
+    try:
+        bot = LinkedinMessage(str(cookies_json), headless)
+        driver = bot.driver
+        print_info("浏览器已启动")
+        return bot, driver
+    except Exception as e:
+        print_error(f"初始化浏览器失败: {e}")
+        raise typer.Exit(code=1)
+def close_browser(driver) -> None:
+    """安全关闭浏览器"""
+    try:
+        driver.quit()
+        print_info("浏览器已关闭")
+    except Exception:
+        pass
+def fetch_page_with_retry(
+    driver,
+    url: str,
+    page_num: int,
+    max_retries: int = 3,
+    delay: int = 5,
+) -> Optional[str]:
+    """
+    获取页面 HTML 源码，带重试机制。
+    Args:
+        driver: Selenium WebDriver 实例
+        url: 目标页面 URL
+        page_num: 当前页码（用于日志显示）
+        max_retries: 最大重试次数
+        delay: 重试间隔秒数
+    Returns:
+        页面 HTML 源码，失败返回 None
+    """
+    for attempt in range(1, max_retries + 1):
+        try:
+            console.log(f"[dim]页面 {page_num} 尝试 {attempt}/{max_retries}[/dim]")
+            driver.get(url)
+            time.sleep(6)  # 等待动态内容加载
+            return driver.page_source
+        except Exception as e:
+            print_error(f"页面 {page_num} 访问失败 (尝试 {attempt}/{max_retries}): {e}")
+            if attempt < max_retries:
+                console.log(f"[dim]{delay} 秒后重试...[/dim]")
+                time.sleep(delay)
+            else:
+                print_error(f"页面 {page_num} 已达最大重试次数，跳过")
+                return None
+    return None