mcp-query-table 0.3.5__tar.gz → 0.3.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/PKG-INFO +19 -10
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/README.md +18 -10
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/__init__.py +4 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/__main__.py +10 -6
- mcp_query_table-0.3.7/mcp_query_table/_version.py +1 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/enums.py +1 -1
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/providers/baidu.py +1 -1
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/providers/n.py +5 -1
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/providers/yuanbao.py +4 -1
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/server.py +19 -13
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/sites/iwencai.py +5 -1
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/tool.py +88 -58
- mcp_query_table-0.3.7/mcp_query_table/utils.py +51 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table.egg-info/PKG-INFO +19 -10
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table.egg-info/SOURCES.txt +1 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table.egg-info/requires.txt +1 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/pyproject.toml +1 -0
- mcp_query_table-0.3.5/mcp_query_table/_version.py +0 -1
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/LICENSE +0 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/providers/__init__.py +0 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/sites/__init__.py +0 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/sites/eastmoney.py +0 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table/sites/tdx.py +0 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table.egg-info/dependency_links.txt +0 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table.egg-info/top_level.txt +0 -0
- {mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mcp_query_table
|
|
3
|
-
Version: 0.3.5
|
|
3
|
+
Version: 0.3.7
|
|
4
4
|
Summary: query table from website, support MCP
|
|
5
5
|
Author-email: wukan <wu-kan@163.com>
|
|
6
6
|
License: MIT License
|
|
@@ -34,6 +34,7 @@ License-File: LICENSE
|
|
|
34
34
|
Requires-Dist: pandas
|
|
35
35
|
Requires-Dist: loguru
|
|
36
36
|
Requires-Dist: playwright
|
|
37
|
+
Requires-Dist: playwright-stealth
|
|
37
38
|
Requires-Dist: mcp
|
|
38
39
|
Dynamic: license-file
|
|
39
40
|
|
|
@@ -41,7 +42,7 @@ Dynamic: license-file
|
|
|
41
42
|
|
|
42
43
|
1. 基于`playwright`实现的财经网页表格爬虫,支持`Model Context Protocol (MCP) `。目前可查询来源为
|
|
43
44
|
|
|
44
|
-
- [
|
|
45
|
+
- [同花顺问财](http://iwencai.com/)
|
|
45
46
|
- [通达信问小达](https://wenda.tdx.com.cn/)
|
|
46
47
|
- [东方财富条件选股](https://xuangu.eastmoney.com/)
|
|
47
48
|
|
|
@@ -70,7 +71,7 @@ from mcp_query_table import *
|
|
|
70
71
|
|
|
71
72
|
|
|
72
73
|
async def main() -> None:
|
|
73
|
-
async with BrowserManager(
|
|
74
|
+
async with BrowserManager(endpoint="http://127.0.0.1:9222", executable_path=None, devtools=True) as bm:
|
|
74
75
|
# 问财需要保证浏览器宽度>768,防止界面变成适应手机
|
|
75
76
|
page = await bm.get_page()
|
|
76
77
|
df = await query(page, '收益最好的200只ETF', query_type=QueryType.ETF, max_page=1, site=Site.THS)
|
|
@@ -128,12 +129,21 @@ if __name__ == '__main__':
|
|
|
128
129
|
|
|
129
130
|
后期会根据不同的网站改版情况,使用更适合的方法。
|
|
130
131
|
|
|
132
|
+
## 无头模式
|
|
133
|
+
|
|
134
|
+
无头模式运行速度更快,但部分网站需要提前登录,所以,无头模式一定要指定`user_data_dir`,否则会出现需要登录的情况。
|
|
135
|
+
|
|
136
|
+
- `endpoint=None`时,`headless=True`可无头启动新浏览器实例。指定`executable_path`和`user_data_dir`,才能确保无头模式下正常运行。
|
|
137
|
+
- `endpoint`以`http://`开头,连接`CDP`模式启动的有头浏览器,参数必有`--remote-debugging-port`。`executable_path`为本地浏览器路径。
|
|
138
|
+
- `endpoint`以`ws://`开头,连接远程`Playwright Server`。也是无头模式,但无法指定`user_data_dir`,所以使用受限
|
|
139
|
+
- 参考:https://playwright.dev/python/docs/docker#running-the-playwright-server
|
|
140
|
+
|
|
131
141
|
## MCP支持
|
|
132
142
|
|
|
133
143
|
确保可以在控制台中执行`python -m mcp_query_table -h`。如果不能,可能要先`pip install mcp_query_table`
|
|
134
144
|
|
|
135
|
-
在`Cline`中可以配置如下。其中`command`是`python`的绝对路径,`
|
|
136
|
-
|
|
145
|
+
在`Cline`中可以配置如下。其中`command`是`python`的绝对路径,`timeout`是超时时间,单位为秒。 在各`AI`
|
|
146
|
+
平台中由于返回时间常需1分钟以上,所以需要设置大的超时时间。
|
|
137
147
|
|
|
138
148
|
### STDIO方式
|
|
139
149
|
|
|
@@ -148,7 +158,7 @@ if __name__ == '__main__':
|
|
|
148
158
|
"mcp_query_table",
|
|
149
159
|
"--format",
|
|
150
160
|
"markdown",
|
|
151
|
-
"--cdp_endpoint",
|
|
161
|
+
"--endpoint",
|
|
152
162
|
"http://127.0.0.1:9222",
|
|
153
163
|
"--executable_path",
|
|
154
164
|
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
|
|
@@ -163,7 +173,7 @@ if __name__ == '__main__':
|
|
|
163
173
|
先在控制台中执行如下命令,启动`MCP`服务
|
|
164
174
|
|
|
165
175
|
```commandline
|
|
166
|
-
python -m mcp_query_table --format markdown --transport sse --port 8000
|
|
176
|
+
python -m mcp_query_table --format markdown --transport sse --port 8000 --endpoint http://127.0.0.1:9222
|
|
167
177
|
```
|
|
168
178
|
|
|
169
179
|
然后就可以连接到`MCP`服务了
|
|
@@ -182,7 +192,7 @@ python -m mcp_query_table --format markdown --transport sse --port 8000
|
|
|
182
192
|
## 使用`MCP Inspector`进行调试
|
|
183
193
|
|
|
184
194
|
```commandline
|
|
185
|
-
npx @modelcontextprotocol/inspector python -m mcp_query_table --format markdown
|
|
195
|
+
npx @modelcontextprotocol/inspector python -m mcp_query_table --format markdown --endpoint http://127.0.0.1:9222
|
|
186
196
|
```
|
|
187
197
|
|
|
188
198
|
打开浏览器并翻页是一个比较耗时的操作,会导致`MCP Inspector`页面超时,可以`http://localhost:5173/?timeout=300000`
|
|
@@ -212,6 +222,5 @@ npx @modelcontextprotocol/inspector python -m mcp_query_table --format markdown
|
|
|
212
222
|

|
|
213
223
|
|
|
214
224
|
## 参考
|
|
215
|
-
|
|
216
|
-
- [Playwright](https://playwright.dev/python/docs/intro)
|
|
217
225
|
- [Selenium webdriver无法附加到edge实例,edge的--remote-debugging-port选项无效](https://blog.csdn.net/qq_30576521/article/details/142370538)
|
|
226
|
+
- https://github.com/AtuboDad/playwright_stealth/issues/31
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
1. 基于`playwright`实现的财经网页表格爬虫,支持`Model Context Protocol (MCP) `。目前可查询来源为
|
|
4
4
|
|
|
5
|
-
- [
|
|
5
|
+
- [同花顺问财](http://iwencai.com/)
|
|
6
6
|
- [通达信问小达](https://wenda.tdx.com.cn/)
|
|
7
7
|
- [东方财富条件选股](https://xuangu.eastmoney.com/)
|
|
8
8
|
|
|
@@ -31,7 +31,7 @@ from mcp_query_table import *
|
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
async def main() -> None:
|
|
34
|
-
async with BrowserManager(
|
|
34
|
+
async with BrowserManager(endpoint="http://127.0.0.1:9222", executable_path=None, devtools=True) as bm:
|
|
35
35
|
# 问财需要保证浏览器宽度>768,防止界面变成适应手机
|
|
36
36
|
page = await bm.get_page()
|
|
37
37
|
df = await query(page, '收益最好的200只ETF', query_type=QueryType.ETF, max_page=1, site=Site.THS)
|
|
@@ -89,12 +89,21 @@ if __name__ == '__main__':
|
|
|
89
89
|
|
|
90
90
|
后期会根据不同的网站改版情况,使用更适合的方法。
|
|
91
91
|
|
|
92
|
+
## 无头模式
|
|
93
|
+
|
|
94
|
+
无头模式运行速度更快,但部分网站需要提前登录,所以,无头模式一定要指定`user_data_dir`,否则会出现需要登录的情况。
|
|
95
|
+
|
|
96
|
+
- `endpoint=None`时,`headless=True`可无头启动新浏览器实例。指定`executable_path`和`user_data_dir`,才能确保无头模式下正常运行。
|
|
97
|
+
- `endpoint`以`http://`开头,连接`CDP`模式启动的有头浏览器,参数必有`--remote-debugging-port`。`executable_path`为本地浏览器路径。
|
|
98
|
+
- `endpoint`以`ws://`开头,连接远程`Playwright Server`。也是无头模式,但无法指定`user_data_dir`,所以使用受限
|
|
99
|
+
- 参考:https://playwright.dev/python/docs/docker#running-the-playwright-server
|
|
100
|
+
|
|
92
101
|
## MCP支持
|
|
93
102
|
|
|
94
103
|
确保可以在控制台中执行`python -m mcp_query_table -h`。如果不能,可能要先`pip install mcp_query_table`
|
|
95
104
|
|
|
96
|
-
在`Cline`中可以配置如下。其中`command`是`python`的绝对路径,`
|
|
97
|
-
|
|
105
|
+
在`Cline`中可以配置如下。其中`command`是`python`的绝对路径,`timeout`是超时时间,单位为秒。 在各`AI`
|
|
106
|
+
平台中由于返回时间常需1分钟以上,所以需要设置大的超时时间。
|
|
98
107
|
|
|
99
108
|
### STDIO方式
|
|
100
109
|
|
|
@@ -109,7 +118,7 @@ if __name__ == '__main__':
|
|
|
109
118
|
"mcp_query_table",
|
|
110
119
|
"--format",
|
|
111
120
|
"markdown",
|
|
112
|
-
"--cdp_endpoint",
|
|
121
|
+
"--endpoint",
|
|
113
122
|
"http://127.0.0.1:9222",
|
|
114
123
|
"--executable_path",
|
|
115
124
|
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
|
|
@@ -124,7 +133,7 @@ if __name__ == '__main__':
|
|
|
124
133
|
先在控制台中执行如下命令,启动`MCP`服务
|
|
125
134
|
|
|
126
135
|
```commandline
|
|
127
|
-
python -m mcp_query_table --format markdown --transport sse --port 8000
|
|
136
|
+
python -m mcp_query_table --format markdown --transport sse --port 8000 --endpoint http://127.0.0.1:9222
|
|
128
137
|
```
|
|
129
138
|
|
|
130
139
|
然后就可以连接到`MCP`服务了
|
|
@@ -143,7 +152,7 @@ python -m mcp_query_table --format markdown --transport sse --port 8000
|
|
|
143
152
|
## 使用`MCP Inspector`进行调试
|
|
144
153
|
|
|
145
154
|
```commandline
|
|
146
|
-
npx @modelcontextprotocol/inspector python -m mcp_query_table --format markdown
|
|
155
|
+
npx @modelcontextprotocol/inspector python -m mcp_query_table --format markdown --endpoint http://127.0.0.1:9222
|
|
147
156
|
```
|
|
148
157
|
|
|
149
158
|
打开浏览器并翻页是一个比较耗时的操作,会导致`MCP Inspector`页面超时,可以`http://localhost:5173/?timeout=300000`
|
|
@@ -173,6 +182,5 @@ npx @modelcontextprotocol/inspector python -m mcp_query_table --format markdown
|
|
|
173
182
|

|
|
174
183
|
|
|
175
184
|
## 参考
|
|
176
|
-
|
|
177
|
-
-
|
|
178
|
-
- [Selenium webdriver无法附加到edge实例,edge的--remote-debugging-port选项无效](https://blog.csdn.net/qq_30576521/article/details/142370538)
|
|
185
|
+
- [Selenium webdriver无法附加到edge实例,edge的--remote-debugging-port选项无效](https://blog.csdn.net/qq_30576521/article/details/142370538)
|
|
186
|
+
- https://github.com/AtuboDad/playwright_stealth/issues/31
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import getpass
|
|
2
|
+
|
|
1
3
|
from mcp_query_table.server import serve
|
|
2
4
|
|
|
3
5
|
|
|
@@ -10,11 +12,12 @@ def main():
|
|
|
10
12
|
|
|
11
13
|
parser.add_argument("--format", type=str, help="输出格式",
|
|
12
14
|
default='markdown', choices=['markdown', 'csv', 'json'])
|
|
13
|
-
parser.add_argument("--
|
|
14
|
-
default=
|
|
15
|
-
parser.add_argument("--executable_path", type=str, help="
|
|
16
|
-
default=r'C:\Program Files\Google\Chrome\Application\chrome.exe')
|
|
17
|
-
|
|
15
|
+
parser.add_argument("--endpoint", type=str, help="浏览器CDP地址/WS地址",
|
|
16
|
+
nargs="?", default=r'http://127.0.0.1:9222')
|
|
17
|
+
parser.add_argument("--executable_path", type=str, help="浏览器路径",
|
|
18
|
+
nargs="?", default=r'C:\Program Files\Google\Chrome\Application\chrome.exe')
|
|
19
|
+
parser.add_argument("--user_data_dir", type=str, help="浏览器用户数据目录",
|
|
20
|
+
nargs="?", default=rf'C:\Users\{getpass.getuser()}\AppData\Local\Google\Chrome\User Data')
|
|
18
21
|
parser.add_argument("--transport", type=str, help="传输类型",
|
|
19
22
|
default='stdio', choices=['stdio', 'sse'])
|
|
20
23
|
parser.add_argument("--host", type=str, help="MCP服务端绑定地址",
|
|
@@ -22,7 +25,8 @@ def main():
|
|
|
22
25
|
parser.add_argument("--port", type=int, help="MCP服务端绑定端口",
|
|
23
26
|
default='8000')
|
|
24
27
|
args = parser.parse_args()
|
|
25
|
-
serve(args.format, args.
|
|
28
|
+
serve(args.format, args.endpoint,
|
|
29
|
+
args.executable_path, args.user_data_dir,
|
|
26
30
|
args.transport, args.host, args.port)
|
|
27
31
|
|
|
28
32
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.7"
|
|
@@ -8,7 +8,7 @@ import json
|
|
|
8
8
|
from playwright.async_api import Page
|
|
9
9
|
|
|
10
10
|
import mcp_query_table
|
|
11
|
-
from mcp_query_table.
|
|
11
|
+
from mcp_query_table.utils import split_images, GlobalVars
|
|
12
12
|
|
|
13
13
|
_PAGE0_ = "https://chat.baidu.com/search"
|
|
14
14
|
_PAGE1_ = "https://chat.baidu.com/aichat/api/conversation"
|
|
@@ -3,10 +3,11 @@
|
|
|
3
3
|
"""
|
|
4
4
|
import json
|
|
5
5
|
|
|
6
|
+
from loguru import logger
|
|
6
7
|
from playwright.async_api import Page
|
|
7
8
|
|
|
8
9
|
import mcp_query_table
|
|
9
|
-
from mcp_query_table.
|
|
10
|
+
from mcp_query_table.utils import is_image, GlobalVars
|
|
10
11
|
|
|
11
12
|
_PAGE0_ = "https://www.n.cn"
|
|
12
13
|
_PAGE1_ = "https://www.n.cn/search"
|
|
@@ -78,6 +79,8 @@ async def chat(page: Page,
|
|
|
78
79
|
str
|
|
79
80
|
回答
|
|
80
81
|
"""
|
|
82
|
+
logger.warning("纳米搜索。不登录可以使用。但无头模式要指定`user_data_dir`才能正常工作")
|
|
83
|
+
|
|
81
84
|
if not create:
|
|
82
85
|
if not page.url.startswith(_PAGE1_):
|
|
83
86
|
create = True
|
|
@@ -102,6 +105,7 @@ async def chat(page: Page,
|
|
|
102
105
|
textbox = page.get_by_role("textbox", name=name)
|
|
103
106
|
await textbox.fill(prompt)
|
|
104
107
|
await textbox.press("Enter")
|
|
108
|
+
# await page.screenshot(path="n.png")
|
|
105
109
|
await on_response(await response_info.value)
|
|
106
110
|
|
|
107
111
|
return G.get_text()
|
|
@@ -4,10 +4,11 @@
|
|
|
4
4
|
import json
|
|
5
5
|
import re
|
|
6
6
|
|
|
7
|
+
from loguru import logger
|
|
7
8
|
from playwright.async_api import Page
|
|
8
9
|
|
|
9
10
|
import mcp_query_table
|
|
10
|
-
from mcp_query_table.
|
|
11
|
+
from mcp_query_table.utils import split_images, GlobalVars
|
|
11
12
|
|
|
12
13
|
_PAGE0_ = "https://yuanbao.tencent.com/"
|
|
13
14
|
_PAGE1_ = "https://yuanbao.tencent.com/api/chat"
|
|
@@ -69,6 +70,8 @@ async def chat(page: Page,
|
|
|
69
70
|
create: bool,
|
|
70
71
|
files: list[str]
|
|
71
72
|
) -> str:
|
|
73
|
+
logger.info("腾讯元宝。登录才可以使用。无头模式时要指定`user_data_dir`才能正常工作")
|
|
74
|
+
|
|
72
75
|
if not page.url.startswith(_PAGE0_):
|
|
73
76
|
create = True
|
|
74
77
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Annotated,
|
|
1
|
+
from typing import Annotated, List
|
|
2
2
|
|
|
3
3
|
from loguru import logger
|
|
4
4
|
from mcp.server.fastmcp import FastMCP
|
|
@@ -10,12 +10,17 @@ from mcp_query_table.tool import BrowserManager
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class QueryServer:
|
|
13
|
-
def __init__(self
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
13
|
+
def __init__(self) -> None:
|
|
14
|
+
self.format: str = "markdown"
|
|
15
|
+
self.browser = None
|
|
16
|
+
|
|
17
|
+
def start(self, format, endpoint, executable_path, user_data_dir):
|
|
17
18
|
self.format: str = format
|
|
18
|
-
self.browser = BrowserManager(
|
|
19
|
+
self.browser = BrowserManager(endpoint=endpoint,
|
|
20
|
+
executable_path=executable_path,
|
|
21
|
+
user_data_dir=user_data_dir,
|
|
22
|
+
devtools=False,
|
|
23
|
+
headless=True)
|
|
19
24
|
|
|
20
25
|
async def query(self, query_input: str, query_type: QueryType, max_page: int, site: Site):
|
|
21
26
|
page = await self.browser.get_page()
|
|
@@ -64,14 +69,15 @@ async def chat(
|
|
|
64
69
|
return await qsv.chat(prompt, create, files, provider)
|
|
65
70
|
|
|
66
71
|
|
|
67
|
-
def serve(format,
|
|
68
|
-
qsv.format
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
logger.info(f"{
|
|
72
|
-
logger.info(f"{cdp_endpoint=},{executable_path=}")
|
|
72
|
+
def serve(format, endpoint, executable_path, user_data_dir, transport, host, port):
|
|
73
|
+
qsv.start(format, endpoint, executable_path, user_data_dir)
|
|
74
|
+
logger.info(f"{endpoint=}")
|
|
75
|
+
logger.info(f"{executable_path=}")
|
|
76
|
+
logger.info(f"{user_data_dir=}")
|
|
73
77
|
if transport == 'sse':
|
|
74
|
-
logger.info(f"{
|
|
78
|
+
logger.info(f"{transport=},{format=},{host=},{port=}")
|
|
79
|
+
else:
|
|
80
|
+
logger.info(f"{transport=},{format=}")
|
|
75
81
|
|
|
76
82
|
mcp.settings.host = host
|
|
77
83
|
mcp.settings.port = port
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
同花顺问财
|
|
3
3
|
https://www.iwencai.com/
|
|
4
4
|
|
|
5
5
|
1. 一定要保证浏览器宽度>768,防止界面变成适应手机
|
|
@@ -10,8 +10,10 @@ import re
|
|
|
10
10
|
import pandas as pd
|
|
11
11
|
from loguru import logger
|
|
12
12
|
from playwright.async_api import Page
|
|
13
|
+
from playwright_stealth import stealth_async
|
|
13
14
|
|
|
14
15
|
from mcp_query_table.enums import QueryType
|
|
16
|
+
from mcp_query_table.utils import FixedConfig
|
|
15
17
|
|
|
16
18
|
# 初次查询页面
|
|
17
19
|
_PAGE1_ = 'https://www.iwencai.com/customized/chart/get-robot-data'
|
|
@@ -152,6 +154,8 @@ async def query(page: Page,
|
|
|
152
154
|
querytype = _querytype_.get(type_, None)
|
|
153
155
|
assert querytype is not None, f"不支持的类型:{type_}"
|
|
154
156
|
|
|
157
|
+
await stealth_async(page, FixedConfig())
|
|
158
|
+
|
|
155
159
|
await page.route(re.compile(r'.*\.(?:jpg|jpeg|png|gif|webp)(?:$|\?)'), lambda route: route.abort())
|
|
156
160
|
|
|
157
161
|
P.reset()
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
+
import getpass
|
|
1
2
|
import subprocess
|
|
2
3
|
import sys
|
|
3
4
|
import time
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Optional
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from urllib.parse import urlparse
|
|
6
8
|
|
|
7
9
|
import pandas as pd
|
|
8
10
|
from loguru import logger
|
|
@@ -25,6 +27,7 @@ def create_detached_process(command):
|
|
|
25
27
|
kwargs.update({
|
|
26
28
|
'start_new_session': True # 创建新的会话
|
|
27
29
|
})
|
|
30
|
+
logger.info(f"Popen: {command}")
|
|
28
31
|
return subprocess.Popen(command, **kwargs)
|
|
29
32
|
|
|
30
33
|
|
|
@@ -36,6 +39,13 @@ def is_local_url(url: str) -> bool:
|
|
|
36
39
|
return False
|
|
37
40
|
|
|
38
41
|
|
|
42
|
+
def is_cdp_url(url: str) -> bool:
|
|
43
|
+
"""判断url是否是CDP地址"""
|
|
44
|
+
if url.startswith('ws://') or url.startswith('wss://'):
|
|
45
|
+
return False
|
|
46
|
+
return True
|
|
47
|
+
|
|
48
|
+
|
|
39
49
|
def get_executable_path(executable_path) -> Optional[str]:
|
|
40
50
|
"""获取浏览器可执行文件路径"""
|
|
41
51
|
browsers = {
|
|
@@ -51,6 +61,21 @@ def get_executable_path(executable_path) -> Optional[str]:
|
|
|
51
61
|
return None
|
|
52
62
|
|
|
53
63
|
|
|
64
|
+
def get_user_data_dir(user_data_dir) -> Optional[str]:
|
|
65
|
+
"""获取浏览器可用户目录"""
|
|
66
|
+
browsers = {
|
|
67
|
+
"default": user_data_dir,
|
|
68
|
+
"chrome.exe": rf'C:\Users\{getpass.getuser()}\AppData\Local\Google\Chrome\User Data',
|
|
69
|
+
"msedge.exe": rf"C:\Users\{getpass.getuser()}\AppData\Local\Microsoft\Edge\User Data",
|
|
70
|
+
}
|
|
71
|
+
for k, v in browsers.items():
|
|
72
|
+
if v is None:
|
|
73
|
+
continue
|
|
74
|
+
if Path(v).exists():
|
|
75
|
+
return v
|
|
76
|
+
return None
|
|
77
|
+
|
|
78
|
+
|
|
54
79
|
class BrowserManager:
|
|
55
80
|
async def __aenter__(self):
|
|
56
81
|
return self
|
|
@@ -59,24 +84,36 @@ class BrowserManager:
|
|
|
59
84
|
await self.cleanup()
|
|
60
85
|
|
|
61
86
|
def __init__(self,
|
|
62
|
-
|
|
87
|
+
endpoint: Optional[str],
|
|
63
88
|
executable_path: Optional[str] = None,
|
|
64
|
-
|
|
89
|
+
devtools: bool = False,
|
|
90
|
+
headless: bool = True,
|
|
91
|
+
user_data_dir: Optional[str] = None):
|
|
65
92
|
"""
|
|
66
93
|
|
|
67
94
|
Parameters
|
|
68
95
|
----------
|
|
69
|
-
|
|
70
|
-
浏览器CDP
|
|
96
|
+
endpoint:str or None
|
|
97
|
+
浏览器CDP地址/WS地址。
|
|
98
|
+
如果为None,则直接启动浏览器实例。可用无头模式。建议指定用户数据目录,否则可能无法使用某些需要登录的网站
|
|
71
99
|
executable_path:str
|
|
72
100
|
浏览器可执行文件路径。推荐使用chrome,因为Microsoft Edge必须在任务管理器中完全退出才能启动调试端口
|
|
73
|
-
|
|
101
|
+
devtools:bool
|
|
74
102
|
是否显示开发者工具
|
|
103
|
+
headless:bool
|
|
104
|
+
是否无头模式启动浏览器
|
|
105
|
+
user_data_dir:str
|
|
106
|
+
浏览器用户数据目录。无头模式。强烈建议指定用户数据目录,否则可能无法使用某些需要登录的网站
|
|
75
107
|
|
|
76
108
|
"""
|
|
77
|
-
|
|
109
|
+
if devtools:
|
|
110
|
+
headless = False
|
|
111
|
+
|
|
112
|
+
self.endpoint = endpoint
|
|
78
113
|
self.executable_path = executable_path
|
|
79
|
-
self.
|
|
114
|
+
self.devtools = devtools
|
|
115
|
+
self.headless = headless
|
|
116
|
+
self.user_data_dir = user_data_dir
|
|
80
117
|
|
|
81
118
|
self.playwright: Optional[Playwright] = None
|
|
82
119
|
self.browser = None
|
|
@@ -92,34 +129,57 @@ class BrowserManager:
|
|
|
92
129
|
|
|
93
130
|
async def _connect_to_local(self) -> None:
|
|
94
131
|
"""连接本地浏览器"""
|
|
95
|
-
port = self.
|
|
132
|
+
port = urlparse(self.endpoint).port
|
|
96
133
|
executable_path = get_executable_path(self.executable_path)
|
|
134
|
+
name = Path(executable_path).name
|
|
97
135
|
command = [executable_path, f'--remote-debugging-port={port}', '--start-maximized']
|
|
98
|
-
if self.
|
|
136
|
+
if self.devtools:
|
|
99
137
|
command.append('--auto-open-devtools-for-tabs')
|
|
100
138
|
|
|
101
139
|
for i in range(2):
|
|
102
140
|
try:
|
|
103
|
-
self.browser = await self.playwright.chromium.connect_over_cdp(self.
|
|
141
|
+
self.browser = await self.playwright.chromium.connect_over_cdp(self.endpoint,
|
|
104
142
|
timeout=10000, slow_mo=1000)
|
|
105
143
|
break
|
|
106
144
|
except:
|
|
107
145
|
if i == 0:
|
|
108
|
-
logger.info(f"start browser:{command}")
|
|
109
146
|
create_detached_process(command)
|
|
110
147
|
time.sleep(3)
|
|
111
148
|
continue
|
|
112
149
|
if i == 1:
|
|
113
150
|
raise ConnectionError(
|
|
114
|
-
f"已提前打开了浏览器,但未开启远程调试端口?请关闭浏览器全部进程后重试 `taskkill /f /im {
|
|
151
|
+
f"已提前打开了浏览器,但未开启远程调试端口?请关闭浏览器全部进程后重试 `taskkill /f /im {name}`")
|
|
115
152
|
|
|
116
153
|
async def _connect_to_remote(self) -> None:
|
|
117
154
|
"""连接远程浏览器"""
|
|
118
155
|
try:
|
|
119
|
-
|
|
120
|
-
|
|
156
|
+
if is_cdp_url(self.endpoint):
|
|
157
|
+
self.browser = await self.playwright.chromium.connect_over_cdp(self.endpoint,
|
|
158
|
+
timeout=10000, slow_mo=1000)
|
|
159
|
+
else:
|
|
160
|
+
self.browser = await self.playwright.chromium.connect(self.endpoint,
|
|
161
|
+
timeout=10000, slow_mo=1000)
|
|
121
162
|
except:
|
|
122
|
-
raise ConnectionError(f"连接远程浏览器失败,请检查CDP地址和端口是否正确。{self.
|
|
163
|
+
raise ConnectionError(f"连接远程浏览器失败,请检查CDP/WS地址和端口是否正确。{self.endpoint}")
|
|
164
|
+
|
|
165
|
+
async def _connect_to_launch(self) -> None:
|
|
166
|
+
logger.info("executable_path={}", self.executable_path)
|
|
167
|
+
if self.user_data_dir:
|
|
168
|
+
logger.info("user_data_dir={}", self.user_data_dir)
|
|
169
|
+
try:
|
|
170
|
+
self.context = await self.playwright.chromium.launch_persistent_context(
|
|
171
|
+
user_data_dir=self.user_data_dir,
|
|
172
|
+
executable_path=self.executable_path,
|
|
173
|
+
headless=self.headless,
|
|
174
|
+
devtools=self.devtools)
|
|
175
|
+
except:
|
|
176
|
+
raise ConnectionError(f"launch失败,可能已经有浏览器已经打开了数据目录。{self.user_data_dir}")
|
|
177
|
+
else:
|
|
178
|
+
logger.warning("未指定浏览器用户数据目录,部分需要的网站可能无法使用")
|
|
179
|
+
self.browser = await self.playwright.chromium.launch(
|
|
180
|
+
executable_path=self.executable_path,
|
|
181
|
+
headless=self.headless,
|
|
182
|
+
devtools=self.devtools)
|
|
123
183
|
|
|
124
184
|
async def _launch(self) -> None:
|
|
125
185
|
"""启动浏览器,并连接CDP协议
|
|
@@ -130,13 +190,20 @@ class BrowserManager:
|
|
|
130
190
|
|
|
131
191
|
"""
|
|
132
192
|
self.playwright = await async_playwright().start()
|
|
133
|
-
|
|
134
|
-
|
|
193
|
+
if self.endpoint is None:
|
|
194
|
+
await self._connect_to_launch()
|
|
195
|
+
elif is_local_url(self.endpoint) and is_cdp_url(self.endpoint):
|
|
135
196
|
await self._connect_to_local()
|
|
136
197
|
else:
|
|
137
198
|
await self._connect_to_remote()
|
|
138
199
|
|
|
139
|
-
self.
|
|
200
|
+
if self.browser is None:
|
|
201
|
+
pass
|
|
202
|
+
elif len(self.browser.contexts) == 0:
|
|
203
|
+
self.context = await self.browser.new_context()
|
|
204
|
+
else:
|
|
205
|
+
self.context = self.browser.contexts[0]
|
|
206
|
+
|
|
140
207
|
# 复用打开的page
|
|
141
208
|
for page in self.context.pages:
|
|
142
209
|
# 防止开发者工具被使用
|
|
@@ -150,15 +217,10 @@ class BrowserManager:
|
|
|
150
217
|
continue
|
|
151
218
|
self.pages.append(page)
|
|
152
219
|
|
|
153
|
-
async def _try_launch(self) -> None:
|
|
154
|
-
if self.browser is None:
|
|
155
|
-
await self._launch()
|
|
156
|
-
if not self.browser.is_connected():
|
|
157
|
-
await self._launch()
|
|
158
|
-
|
|
159
220
|
async def get_page(self) -> Page:
|
|
160
221
|
"""获取可用Page。无空闲标签时会打开新标签"""
|
|
161
|
-
|
|
222
|
+
if self.context is None:
|
|
223
|
+
await self._launch()
|
|
162
224
|
|
|
163
225
|
# 反复取第一个tab
|
|
164
226
|
while len(self.pages) > 0:
|
|
@@ -178,19 +240,6 @@ class BrowserManager:
|
|
|
178
240
|
self.pages.append(page)
|
|
179
241
|
|
|
180
242
|
|
|
181
|
-
class GlobalVars:
|
|
182
|
-
"""全局变量"""
|
|
183
|
-
|
|
184
|
-
def __init__(self):
|
|
185
|
-
self.text = ""
|
|
186
|
-
|
|
187
|
-
def set_text(self, text):
|
|
188
|
-
self.text = text
|
|
189
|
-
|
|
190
|
-
def get_text(self):
|
|
191
|
-
return self.text
|
|
192
|
-
|
|
193
|
-
|
|
194
243
|
async def query(
|
|
195
244
|
page: Page,
|
|
196
245
|
query_input: str = "收盘价>100元",
|
|
@@ -274,22 +323,3 @@ async def chat(
|
|
|
274
323
|
return await chat(page, prompt, create, files)
|
|
275
324
|
|
|
276
325
|
raise ValueError(f"未支持的提供商:{provider}")
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
def is_image(path: str) -> bool:
|
|
280
|
-
"""判断是否是图片文件"""
|
|
281
|
-
img_ext = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']
|
|
282
|
-
ext = Path(path).suffix.lower()
|
|
283
|
-
return ext in img_ext
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
def split_images(files: List[str]) -> Tuple[List[str], List[str]]:
|
|
287
|
-
"""图片列表分成两部分"""
|
|
288
|
-
imgs = []
|
|
289
|
-
docs = []
|
|
290
|
-
for f in files:
|
|
291
|
-
if is_image(f):
|
|
292
|
-
imgs.append(f)
|
|
293
|
-
else:
|
|
294
|
-
docs.append(f)
|
|
295
|
-
return imgs, docs
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import string
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List, Tuple
|
|
5
|
+
|
|
6
|
+
from playwright_stealth import StealthConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def is_image(path: str) -> bool:
|
|
10
|
+
"""判断是否是图片文件"""
|
|
11
|
+
img_ext = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']
|
|
12
|
+
ext = Path(path).suffix.lower()
|
|
13
|
+
return ext in img_ext
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def split_images(files: List[str]) -> Tuple[List[str], List[str]]:
|
|
17
|
+
"""图片列表分成两部分"""
|
|
18
|
+
imgs = []
|
|
19
|
+
docs = []
|
|
20
|
+
for f in files:
|
|
21
|
+
if is_image(f):
|
|
22
|
+
imgs.append(f)
|
|
23
|
+
else:
|
|
24
|
+
docs.append(f)
|
|
25
|
+
return imgs, docs
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class GlobalVars:
|
|
29
|
+
"""全局变量"""
|
|
30
|
+
|
|
31
|
+
def __init__(self):
|
|
32
|
+
self.text = ""
|
|
33
|
+
|
|
34
|
+
def set_text(self, text):
|
|
35
|
+
self.text = text
|
|
36
|
+
|
|
37
|
+
def get_text(self):
|
|
38
|
+
return self.text
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# https://github.com/AtuboDad/playwright_stealth/issues/31#issuecomment-2342541305
|
|
42
|
+
class FixedConfig(StealthConfig):
|
|
43
|
+
|
|
44
|
+
@property
|
|
45
|
+
def enabled_scripts(self):
|
|
46
|
+
key = "".join(random.choices(string.ascii_letters, k=10))
|
|
47
|
+
for script in super().enabled_scripts:
|
|
48
|
+
if "const opts" in script:
|
|
49
|
+
yield script.replace("const opts", f"window.{key}")
|
|
50
|
+
continue
|
|
51
|
+
yield script.replace("opts", f"window.{key}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mcp_query_table
|
|
3
|
-
Version: 0.3.5
|
|
3
|
+
Version: 0.3.7
|
|
4
4
|
Summary: query table from website, support MCP
|
|
5
5
|
Author-email: wukan <wu-kan@163.com>
|
|
6
6
|
License: MIT License
|
|
@@ -34,6 +34,7 @@ License-File: LICENSE
|
|
|
34
34
|
Requires-Dist: pandas
|
|
35
35
|
Requires-Dist: loguru
|
|
36
36
|
Requires-Dist: playwright
|
|
37
|
+
Requires-Dist: playwright-stealth
|
|
37
38
|
Requires-Dist: mcp
|
|
38
39
|
Dynamic: license-file
|
|
39
40
|
|
|
@@ -41,7 +42,7 @@ Dynamic: license-file
|
|
|
41
42
|
|
|
42
43
|
1. 基于`playwright`实现的财经网页表格爬虫,支持`Model Context Protocol (MCP) `。目前可查询来源为
|
|
43
44
|
|
|
44
|
-
- [
|
|
45
|
+
- [同花顺问财](http://iwencai.com/)
|
|
45
46
|
- [通达信问小达](https://wenda.tdx.com.cn/)
|
|
46
47
|
- [东方财富条件选股](https://xuangu.eastmoney.com/)
|
|
47
48
|
|
|
@@ -70,7 +71,7 @@ from mcp_query_table import *
|
|
|
70
71
|
|
|
71
72
|
|
|
72
73
|
async def main() -> None:
|
|
73
|
-
async with BrowserManager(
|
|
74
|
+
async with BrowserManager(endpoint="http://127.0.0.1:9222", executable_path=None, devtools=True) as bm:
|
|
74
75
|
# 问财需要保证浏览器宽度>768,防止界面变成适应手机
|
|
75
76
|
page = await bm.get_page()
|
|
76
77
|
df = await query(page, '收益最好的200只ETF', query_type=QueryType.ETF, max_page=1, site=Site.THS)
|
|
@@ -128,12 +129,21 @@ if __name__ == '__main__':
|
|
|
128
129
|
|
|
129
130
|
后期会根据不同的网站改版情况,使用更适合的方法。
|
|
130
131
|
|
|
132
|
+
## 无头模式
|
|
133
|
+
|
|
134
|
+
无头模式运行速度更快,但部分网站需要提前登录,所以,无头模式一定要指定`user_data_dir`,否则会出现需要登录的情况。
|
|
135
|
+
|
|
136
|
+
- `endpoint=None`时,`headless=True`可无头启动新浏览器实例。指定`executable_path`和`user_data_dir`,才能确保无头模式下正常运行。
|
|
137
|
+
- `endpoint`以`http://`开头,连接`CDP`模式启动的有头浏览器,参数必有`--remote-debugging-port`。`executable_path`为本地浏览器路径。
|
|
138
|
+
- `endpoint`以`ws://`开头,连接远程`Playwright Server`。也是无头模式,但无法指定`user_data_dir`,所以使用受限
|
|
139
|
+
- 参考:https://playwright.dev/python/docs/docker#running-the-playwright-server
|
|
140
|
+
|
|
131
141
|
## MCP支持
|
|
132
142
|
|
|
133
143
|
确保可以在控制台中执行`python -m mcp_query_table -h`。如果不能,可能要先`pip install mcp_query_table`
|
|
134
144
|
|
|
135
|
-
在`Cline`中可以配置如下。其中`command`是`python`的绝对路径,`
|
|
136
|
-
|
|
145
|
+
在`Cline`中可以配置如下。其中`command`是`python`的绝对路径,`timeout`是超时时间,单位为秒。 在各`AI`
|
|
146
|
+
平台中由于返回时间常需1分钟以上,所以需要设置大的超时时间。
|
|
137
147
|
|
|
138
148
|
### STDIO方式
|
|
139
149
|
|
|
@@ -148,7 +158,7 @@ if __name__ == '__main__':
|
|
|
148
158
|
"mcp_query_table",
|
|
149
159
|
"--format",
|
|
150
160
|
"markdown",
|
|
151
|
-
"--cdp_endpoint",
|
|
161
|
+
"--endpoint",
|
|
152
162
|
"http://127.0.0.1:9222",
|
|
153
163
|
"--executable_path",
|
|
154
164
|
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
|
|
@@ -163,7 +173,7 @@ if __name__ == '__main__':
|
|
|
163
173
|
先在控制台中执行如下命令,启动`MCP`服务
|
|
164
174
|
|
|
165
175
|
```commandline
|
|
166
|
-
python -m mcp_query_table --format markdown --transport sse --port 8000
|
|
176
|
+
python -m mcp_query_table --format markdown --transport sse --port 8000 --endpoint http://127.0.0.1:9222
|
|
167
177
|
```
|
|
168
178
|
|
|
169
179
|
然后就可以连接到`MCP`服务了
|
|
@@ -182,7 +192,7 @@ python -m mcp_query_table --format markdown --transport sse --port 8000
|
|
|
182
192
|
## 使用`MCP Inspector`进行调试
|
|
183
193
|
|
|
184
194
|
```commandline
|
|
185
|
-
npx @modelcontextprotocol/inspector python -m mcp_query_table --format markdown
|
|
195
|
+
npx @modelcontextprotocol/inspector python -m mcp_query_table --format markdown --endpoint http://127.0.0.1:9222
|
|
186
196
|
```
|
|
187
197
|
|
|
188
198
|
打开浏览器并翻页是一个比较耗时的操作,会导致`MCP Inspector`页面超时,可以`http://localhost:5173/?timeout=300000`
|
|
@@ -212,6 +222,5 @@ npx @modelcontextprotocol/inspector python -m mcp_query_table --format markdown
|
|
|
212
222
|

|
|
213
223
|
|
|
214
224
|
## 参考
|
|
215
|
-
|
|
216
|
-
- [Playwright](https://playwright.dev/python/docs/intro)
|
|
217
225
|
- [Selenium webdriver无法附加到edge实例,edge的--remote-debugging-port选项无效](https://blog.csdn.net/qq_30576521/article/details/142370538)
|
|
226
|
+
- https://github.com/AtuboDad/playwright_stealth/issues/31
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.3.5"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mcp_query_table-0.3.5 → mcp_query_table-0.3.7}/mcp_query_table.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|