python-qlv-helper 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- python_qlv_helper-0.2.0.dist-info/METADATA +252 -0
- python_qlv_helper-0.2.0.dist-info/RECORD +32 -0
- python_qlv_helper-0.2.0.dist-info/WHEEL +5 -0
- python_qlv_helper-0.2.0.dist-info/licenses/LICENSE +201 -0
- python_qlv_helper-0.2.0.dist-info/top_level.txt +1 -0
- qlv_helper/__init__.py +11 -0
- qlv_helper/controller/__init__.py +11 -0
- qlv_helper/controller/domestic_activity_order.py +24 -0
- qlv_helper/controller/main_page.py +30 -0
- qlv_helper/controller/order_detail.py +35 -0
- qlv_helper/controller/order_table.py +145 -0
- qlv_helper/controller/user_login.py +119 -0
- qlv_helper/http/__init__.py +11 -0
- qlv_helper/http/main_page.py +41 -0
- qlv_helper/http/order_page.py +313 -0
- qlv_helper/http/order_table_page.py +323 -0
- qlv_helper/po/__init__.py +11 -0
- qlv_helper/po/base_po.py +40 -0
- qlv_helper/po/domestic_activity_order_page.py +129 -0
- qlv_helper/po/login_page.py +136 -0
- qlv_helper/po/main_page.py +71 -0
- qlv_helper/po/wechat_auth_page.py +68 -0
- qlv_helper/utils/__init__.py +11 -0
- qlv_helper/utils/browser_utils.py +25 -0
- qlv_helper/utils/datetime_utils.py +16 -0
- qlv_helper/utils/file_handle.py +33 -0
- qlv_helper/utils/html_utils.py +59 -0
- qlv_helper/utils/ocr_helper.py +83 -0
- qlv_helper/utils/po_utils.py +113 -0
- qlv_helper/utils/stealth_browser.py +100 -0
- qlv_helper/utils/type_utils.py +111 -0
- qlv_helper/utils/windows_utils.py +36 -0
qlv_helper/controller/order_table.py
@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-
"""
# ---------------------------------------------------------------------------------------------------------
# ProjectName: qlv-helper
# FileName: order_table.py
# Description: Order list page controller
# Author: ASUS
# CreateDate: 2025/12/01
# Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
# ---------------------------------------------------------------------------------------------------------
"""
import asyncio
from aiohttp import CookieJar
from typing import Optional, Dict, Any, Callable, List
from http_helper.client.async_proxy import HttpClientFactory
from qlv_helper.utils.html_utils import parse_pagination_info
from qlv_helper.http.order_table_page import get_domestic_activity_order_page_html, get_domestic_ticket_outed_page_html, \
    parse_order_table


async def _get_paginated_order_table(
    *,
    domain: str,
    protocol: str,
    retry: int,
    timeout: int,
    enable_log: bool,
    cookie_jar: Optional[CookieJar],
    playwright_state: Dict[str, Any],
    table_state: str,
    fetch_page_fn: Callable[..., Any],  # function that returns the first-page / paginated HTML
) -> Dict[str, Any]:
    """Generic paginated table scraping (supports concurrency)."""

    order_http_client = HttpClientFactory(
        protocol=protocol if protocol == "http" else "https",
        domain=domain,
        timeout=timeout,
        retry=retry,
        enable_log=enable_log,
        cookie_jar=cookie_jar,
        playwright_state=playwright_state
    )

    # --- 1. Fetch the first page (serially) ---
    response = await fetch_page_fn(
        domain=domain, protocol=protocol, retry=retry, timeout=timeout,
        enable_log=enable_log, cookie_jar=cookie_jar, playwright_state=playwright_state,
        order_http_client=order_http_client, is_end=True
    )
    if response.get("code") != 200:
        return response

    html = response["data"]
    table_data: List[Dict[str, Any]] = parse_order_table(html=html, table_state=table_state)

    pagination_info = parse_pagination_info(html)
    pages = pagination_info.get("pages", 1)

    # --- 2. If there is only one page, return directly ---
    if pages <= 1:
        pagination_info.update({
            "data": table_data,
            "is_next_page": False,
            "page_size": len(table_data),
            "pages": 1
        })
        response["data"] = pagination_info
        return response

    # --- 3. Multiple pages: fetch pages 2..pages concurrently ---
    async def fetch_page(client: HttpClientFactory, page: int) -> List[Optional[Dict[str, Any]]]:
        """Single-page fetch task, used for concurrent scheduling."""
        try:
            resp = await fetch_page_fn(
                domain=domain, protocol=protocol, retry=retry, timeout=timeout,
                enable_log=enable_log, cookie_jar=cookie_jar, playwright_state=playwright_state,
                order_http_client=client, current_page=page, pages=pages, is_end=(page == pages)
            )
            if resp.get("code") == 200:
                return parse_order_table(html=resp["data"], table_state=table_state)
        except (Exception, ):
            return list()  # return an empty list on failure so one bad page does not break the whole run
        return list()

    # 🔥 Concurrency: fetch all remaining pages in one go
    order_http_client = HttpClientFactory(
        protocol=protocol if protocol == "http" else "https",
        domain=domain,
        timeout=timeout,
        retry=retry,
        enable_log=enable_log,
        cookie_jar=cookie_jar,
        playwright_state=playwright_state
    )
    tasks = [fetch_page(client=order_http_client, page=page) for page in range(2, pages + 1)]
    results = await asyncio.gather(*tasks)

    # Merge the table data
    for r in results:
        if r:
            table_data.extend(r)

    # --- 4. Build the final response payload ---
    pagination_info.update({
        "data": table_data,
        "is_next_page": False,
        "page_size": len(table_data),
        "pages": 1
    })
    response["data"] = pagination_info
    return response

async def get_domestic_activity_order_table(
    domain: str, protocol: str = "http", retry: int = 1, timeout: int = 5, enable_log: bool = True,
    cookie_jar: Optional[CookieJar] = None, playwright_state: Dict[str, Any] = None
) -> Dict[str, Any]:
    return await _get_paginated_order_table(
        domain=domain,
        protocol=protocol,
        retry=retry,
        timeout=timeout,
        enable_log=enable_log,
        cookie_jar=cookie_jar,
        playwright_state=playwright_state,
        table_state="proccessing",
        fetch_page_fn=get_domestic_activity_order_page_html
    )


async def get_domestic_ticket_outed_table(
    domain: str, protocol: str = "http", retry: int = 1, timeout: int = 5, enable_log: bool = True,
    cookie_jar: Optional[CookieJar] = None, playwright_state: Dict[str, Any] = None
) -> Dict[str, Any]:
    return await _get_paginated_order_table(
        domain=domain,
        protocol=protocol,
        retry=retry,
        timeout=timeout,
        enable_log=enable_log,
        cookie_jar=cookie_jar,
        playwright_state=playwright_state,
        table_state="completed",
        fetch_page_fn=get_domestic_ticket_outed_page_html
    )
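Note (not part of the package diff): a minimal sketch of how the paginated fetchers above might be driven. The domain, the pre-populated cookie jar, and the exact payload layout are assumptions inferred from _get_paginated_order_table, not documented behaviour.

# Hypothetical driver; "qlv.example.com" and the cookie handling are assumptions, not package facts.
import asyncio
from aiohttp import CookieJar
from qlv_helper.controller.order_table import get_domestic_activity_order_table

async def main() -> None:
    cookie_jar = CookieJar(unsafe=True)  # assumed to already hold a logged-in session
    result = await get_domestic_activity_order_table(
        domain="qlv.example.com",  # hypothetical deployment
        protocol="http", retry=2, timeout=10,
        cookie_jar=cookie_jar, playwright_state=None,
    )
    if result.get("code") == 200:
        rows = result["data"]["data"]  # merged rows from every page, per _get_paginated_order_table
        print(f"{len(rows)} orders fetched")

asyncio.run(main())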
qlv_helper/controller/user_login.py
@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
"""
# ---------------------------------------------------------------------------------------------------------
# ProjectName: qlv-helper
# FileName: user_login.py
# Description: User login page controller
# Author: ASUS
# CreateDate: 2025/11/25
# Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
# ---------------------------------------------------------------------------------------------------------
"""
import asyncio
from typing import Tuple
from qlv_helper.po.login_page import LoginPage
from playwright.async_api import BrowserContext
from qlv_helper.po.wechat_auth_page import WechatAuthPage
from qlv_helper.utils.browser_utils import switch_for_table_window
from qlv_helper.utils.po_utils import on_click_locator, locator_input_element


async def _username_login(login_po: LoginPage, username: str, password: str, timeout: float = 5.0) -> Tuple[bool, str]:
    # 1. Enter the username
    is_success, username_input = await login_po.get_login_username_input(timeout=timeout)
    if is_success is False:
        return is_success, username_input
    await locator_input_element(locator=username_input, text=username.strip())

    # 2. Enter the password
    is_success, password_input = await login_po.get_login_password_input(timeout=timeout)
    if is_success is False:
        return is_success, username_input
    await locator_input_element(locator=password_input, text=password.strip())

    # 3. Get the first-layer captcha
    is_success, code_str = await login_po.get_number_code(timeout=timeout)
    if is_success is False:
        return is_success, code_str

    # 4. Enter the first-layer captcha
    is_success, code_input = await login_po.get_login_number_code_input(timeout=timeout)
    if is_success is False:
        return is_success, code_input
    await locator_input_element(locator=code_input, text=code_str.lower())

    # 5. Click the login button
    is_success, login_btn = await login_po.get_login_btn(timeout=timeout)
    if is_success is False:
        return is_success, login_btn
    await on_click_locator(locator=login_btn)


async def _wechat_login(browser: BrowserContext, login_po: LoginPage, timeout: float = 5.0) -> Tuple[bool, str]:
    # 1. Click the WeChat quick-login entry
    is_success, wechat_entrance = await login_po.get_wechat_entrance(timeout=timeout)
    if is_success is False:
        return is_success, wechat_entrance
    await on_click_locator(locator=wechat_entrance)

    page_new = await switch_for_table_window(browser=browser, url_keyword="open.weixin.qq.com", wait_time=int(timeout))
    wachat_po = WechatAuthPage(page=page_new)

    # 2. Click the "WeChat quick login" button
    is_success, wechat_quick_login_btn = await wachat_po.get_wechat_quick_login_btn(timeout=timeout)
    if is_success is False:
        return is_success, wechat_quick_login_btn
    await on_click_locator(locator=wechat_quick_login_btn)

    # 3. Click the "Allow" button in the WeChat dialog
    return await wachat_po.on_click_allow_btn(timeout=int(timeout) * 3)


async def username_login(
    login_po: LoginPage, username: str, password: str, timeout: float = 5.0, retry: int = 3
) -> Tuple[bool, str]:
    # 1. First pass through the full login flow
    await _username_login(login_po=login_po, username=username, password=password, timeout=timeout)
    for _ in range(retry):
        # 2. Check whether we are still on the login page
        if login_po.is_current_page() is False:
            return True, f"账号:{username} 登录成功"

        # 3. Check whether a login warning is shown; if so, re-enter the captcha and log in again
        is_warn: bool = await login_po.is_exist_login_warn(timeout=timeout)
        if is_warn is True:
            # 4. Get the first-layer captcha
            is_success, code_str = await login_po.get_number_code(timeout=timeout)
            if is_success is False:
                return is_success, code_str

            # 5. Enter the first-layer captcha
            is_success, code_input = await login_po.get_login_number_code_input(timeout=timeout)
            if is_success is False:
                return is_success, code_input
            await locator_input_element(locator=code_input, text=code_str.lower())

            # 6. Click the login button
            is_success, login_btn = await login_po.get_login_btn(timeout=timeout)
            if is_success is False:
                return is_success, login_btn
            await on_click_locator(locator=login_btn)
        else:
            # 7. Run the full login flow once more
            await _username_login(login_po=login_po, username=username, password=password, timeout=timeout)

        await asyncio.sleep(delay=timeout)

    return True, f"账号:{username} 一次登录流程结束"


async def wechat_login(
    browser: BrowserContext, login_po: LoginPage, timeout: float = 5.0, retry: int = 3
) -> Tuple[bool, str]:
    for index in range(retry):
        # Run the full login flow
        is_success, message = await _wechat_login(browser=browser, login_po=login_po, timeout=timeout)

        # Return once login succeeds or retries are exhausted
        if is_success is True or index == retry - 1:
            return is_success, message
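A rough sketch of wiring the login controller to a Playwright page (not from the package: the login URL and the LoginPage(page=...) constructor are assumptions, mirrored from the WechatAuthPage(page=...) call above).

# Hedged sketch; the URL and the LoginPage construction are assumptions.
import asyncio
from playwright.async_api import async_playwright
from qlv_helper.po.login_page import LoginPage
from qlv_helper.controller.user_login import username_login

async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto("http://qlv.example.com/Login")  # hypothetical login URL
        ok, message = await username_login(
            login_po=LoginPage(page=page),  # assumed constructor, by analogy with WechatAuthPage(page=...)
            username="user", password="secret", timeout=5.0, retry=3,
        )
        print(ok, message)
        await browser.close()

asyncio.run(main())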
qlv_helper/http/__init__.py
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
"""
# ---------------------------------------------------------------------------------------------------------
# ProjectName: qlv-helper
# FileName: __init__.py
# Description: Package for HTTP-protocol related handling
# Author: ASUS
# CreateDate: 2025/11/28
# Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
# ---------------------------------------------------------------------------------------------------------
"""
qlv_helper/http/main_page.py
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
"""
# ---------------------------------------------------------------------------------------------------------
# ProjectName: qlv-helper
# FileName: main_page.py
# Description: HTTP response handling module for the main page
# Author: ASUS
# CreateDate: 2025/11/29
# Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
# ---------------------------------------------------------------------------------------------------------
"""
import aiohttp
from bs4 import BeautifulSoup
from typing import Dict, Any, Optional
from http_helper.client.async_proxy import HttpClientFactory


async def get_main_page_html(
    domain: str, protocol: str = "http", retry: int = 1, timeout: int = 5, enable_log: bool = True,
    cookie_jar: Optional[aiohttp.CookieJar] = None, playwright_state: Dict[str, Any] = None
) -> Dict[str, Any]:
    order_http_client = HttpClientFactory(
        protocol=protocol if protocol == "http" else "https",
        domain=domain,
        timeout=timeout,
        retry=retry,
        enable_log=enable_log,
        cookie_jar=cookie_jar,
        playwright_state=playwright_state
    )
    return await order_http_client.request(method="get", url="/", is_end=True)


def parser_head_title(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    # Extract the title text
    title_tag = soup.find('title')
    if title_tag:
        return title_tag.get_text().strip()
    else:
        return ""
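For orientation, a self-contained check of parser_head_title against an invented HTML snippet (the sample markup is not from the package).

# Standalone sanity check; the HTML sample is made up.
from qlv_helper.http.main_page import parser_head_title

sample = "<html><head><title> 订单管理系统 </title></head><body></body></html>"
assert parser_head_title(sample) == "订单管理系统"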
qlv_helper/http/order_page.py
@@ -0,0 +1,313 @@
# -*- coding: utf-8 -*-
"""
# ---------------------------------------------------------------------------------------------------------
# ProjectName: qlv-helper
# FileName: order_page.py
# Description: HTTP response handling module for the order page
# Author: ASUS
# CreateDate: 2025/11/28
# Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
# ---------------------------------------------------------------------------------------------------------
"""
import re
import aiohttp
from datetime import datetime
from bs4 import BeautifulSoup, Tag
from collections import OrderedDict
from typing import Dict, Any, Optional, List
from qlv_helper.utils.type_utils import convert_cn_to_en
from http_helper.client.async_proxy import HttpClientFactory
from qlv_helper.utils.datetime_utils import get_current_dtstr
from qlv_helper.utils.type_utils import get_key_by_index, get_value_by_index, safe_convert_advanced


async def get_order_page_html(
    order_id: int, domain: str, protocol: str = "http", retry: int = 1, timeout: int = 5, enable_log: bool = True,
    cookie_jar: Optional[aiohttp.CookieJar] = None, playwright_state: Dict[str, Any] = None
) -> Dict[str, Any]:
    order_http_client = HttpClientFactory(
        protocol=protocol if protocol == "http" else "https",
        domain=domain,
        timeout=timeout,
        retry=retry,
        enable_log=enable_log,
        cookie_jar=cookie_jar,
        playwright_state=playwright_state
    )
    return await order_http_client.request(
        method="get",
        url=f"/OrderProcessing/NewTicket_show/{order_id}?&r={get_current_dtstr()}",
        is_end=True
    )


def order_info_static_headers() -> OrderedDict[str, str]:
    return OrderedDict([
        ("receipted_ota", "OTA实收"),  # 0
        ("kickback", "佣金"),  # 1
        ("raw_order_no", "平台订单号"),  # 2
        ("trip_type", "行程类型"),  # 3
        ("id", "订单号"),  # 4
        ("stat_opration", "订单操作"),  # 5
        ("stat_order", "订单状态")  # 6
    ])


def parser_order_info(html: str) -> Dict[str, Any]:
    soup = BeautifulSoup(html, "html.parser")
    # Find the target table
    table = soup.find("table", class_="table no_border")
    if not table:
        return {}

    # All td cells
    tds = table.find_all("td")
    result = {}

    for td in tds:
        text = td.get_text(strip=True)
        # Skip the td if it contains no colon (and it is not a button)
        if ":" not in text:
            continue
        # Split on ":"
        try:
            key, value = text.split(":", 1)
        except (Exception, ValueError):
            continue
        # Strip newlines and spaces
        key = key.strip()
        value = value.strip()

        # If value is empty, try the text inside <b> or other child elements
        if not value:
            b = td.find("b")
            if b:
                value = b.get_text(strip=True)

        # Remove unwanted trailing whitespace
        value = value.replace("\u00a0", "").strip()
        value = value[1:] if isinstance(value, str) and value.startswith("[") else value
        value = value[:-1] if isinstance(value, str) and value.endswith("]") else value
        result[key] = safe_convert_advanced(value)
    return convert_cn_to_en(data=result, header_map=order_info_static_headers())


def flight_static_headers() -> OrderedDict[str, str]:
    return OrderedDict([
        ("ticket_state", "票状态"),  # 0
        ("passenger_info", "乘客信息"),  # 1
        ("price_std", "票面价"),  # 2
        ("price_sell", "销售价"),  # 3
        ("tax_air", "机建费"),  # 4
        ("tax_fuel", "燃油费"),  # 5
        ("pnr", "PNR"),  # 6
        ("itinerary", "行程"),  # 7
        ("ticket_no", "票号"),  # 8
        ("itinerary_no", "行程单号"),  # 9
    ])


def flight_extend_headers() -> OrderedDict[str, str]:
    return OrderedDict([
        ("p_name", "姓名"),  # 0
        ("p_type", "类型"),  # 1
        ("id_type", "证件类型"),  # 2
        ("id_no", "身份证"),  # 3
        ("birth_day", "出生年月"),  # 4
        ("age", "年龄"),  # 5
        ("gender", "性别"),  # 6
        ("new_nation", "国籍"),  # 7
        ("card_issue_place", "签发国"),  # 8
        ("id_valid_dat", " 证件有效期"),  # 9
        ("code_dep", " 起飞机场"),  # 10
        ("code_arr", " 抵达机场"),  # 11
    ])


def parse_order_flight_table_headers(html: Tag) -> OrderedDict[str, str]:
    """Parse the flight-table headers."""
    headers = OrderedDict()
    for th in html.find_all("th"):
        # Take the direct text only, not the content of child tags
        direct_texts = th.find_all(text=True, recursive=False)
        header = "".join(t.strip() for t in direct_texts if t.strip()) or th.get_text(strip=True)
        headers[header] = header
    return headers


def clean_order_flight_table(html: Tag) -> str:
    """Strip redundant information out of the flight-table HTML."""
    # Remove script / style / hidden elements
    for tag in html.find_all(["script", "style"]):
        tag.extract()

    # Try to drop tags styled with display:none
    for tag in html.find_all(style=True):
        style = tag["style"].replace(" ", "").lower()
        if "display:none" in style:
            tag.extract()

    return html.get_text(strip=True)


def parse_order_flight_table_passenger_info(raw: Tag, headers: OrderedDict[str, str]) -> Dict[str, Any]:
    """
    Parse a passenger-info TD that contains copyFn(...) and return a dict.
    """
    # -------------------------------
    # 1. Parse <img onclick="copyFn('吴世勇||身份证|140104194702241336|男||1947/2/24 0:00:00||1900/1/1 0:00:00')" src="/images/nav_assistant.png" style="width:20px; height:20px; margin:0px;"/>
    # -------------------------------
    img = raw.find("img", onclick=True)
    if not img:
        return {}

    # Extract the argument string inside copyFn(...)
    onclick = img["onclick"]
    m = re.search(r"copyFn\('(.+?)'\)", onclick)
    if not m:
        return {}

    group = m.group(1)  # "吴世勇||身份证|140104194702241336|男||1947/2/24 0:00:00||1900/1/1 0:00:00"
    parts = group.split("|")

    # Field reference:
    # 0: name
    # 1: empty
    # 2: ID type
    # 3: ID number
    # 4: gender
    # 5: empty
    # 6: date of birth
    # 7: empty
    # 8: ID expiry date (may be absent)
    name = parts[0]
    id_type = parts[2]
    id_no_raw = parts[3]
    sex = parts[4]
    birth_raw = parts[6]
    id_valid_raw = parts[8] if len(parts) > 8 else ""

    # -------------------------------
    # 2. Mask the ID number (to match what the page displays)
    # -------------------------------
    # if len(id_no_raw) >= 8:
    #     id_no_masked = id_no_raw[:6] + "****" + id_no_raw[-4:]
    # else:
    #     id_no_masked = id_no_raw

    # -------------------------------
    # 3. Normalise the birth date: 1947/2/24 → 1947-02-24
    # -------------------------------
    def fmt_date(value: str) -> str:
        try:
            dt = datetime.strptime(value.split(" ")[0], "%Y/%m/%d")
            return dt.strftime("%Y-%m-%d")
        except (Exception,):
            return ""

    birth = fmt_date(birth_raw)
    id_valid = fmt_date(id_valid_raw)

    # -------------------------------
    # 4. Age calculation
    # -------------------------------
    def calc_age(birth_str: str) -> Optional[int]:
        try:
            birthday = datetime.strptime(birth_str, "%Y-%m-%d")
            today = datetime.today()
            return today.year - birthday.year - ((today.month, today.day) < (birthday.month, birthday.day))
        except (Exception,):
            return None

    age = calc_age(birth)

    # -------------------------------
    # 5. The passenger type (adult/child) has to be read from the page
    # -------------------------------
    type_span = raw.find(name="span", text=re.compile(r"(成人|儿童|婴儿)"))
    ptype = ""
    if type_span:
        ptype = type_span.get_text(strip=True).replace("【", "").replace("】", "")

    # -------------------------------
    # 6. Nationality and issuing country come from the two spans with name="guobie"
    # -------------------------------
    guobies = raw.find_all("span", attrs={"name": "guobie"})
    nationality = guobies[0].get_text(strip=True) if len(guobies) > 0 else ""
    issue_country = guobies[1].get_text(strip=True) if len(guobies) > 1 else ""

    return {
        get_key_by_index(index=0, ordered_dict=headers): name,  # name
        get_key_by_index(index=1, ordered_dict=headers): ptype,  # type: adult/child
        get_key_by_index(index=2, ordered_dict=headers): id_type,  # ID type
        get_key_by_index(index=3, ordered_dict=headers): id_no_raw,  # ID number
        get_key_by_index(index=4, ordered_dict=headers): birth,  # date of birth
        get_key_by_index(index=5, ordered_dict=headers): age,  # age
        get_key_by_index(index=6, ordered_dict=headers): sex,  # gender
        get_key_by_index(index=7, ordered_dict=headers): nationality,  # nationality
        get_key_by_index(index=8, ordered_dict=headers): issue_country,  # issuing country
        get_key_by_index(index=9, ordered_dict=headers): id_valid  # ID expiry date
    }


def parse_order_flight_table_row(
    tr: Tag, headers: OrderedDict, extend_headers: OrderedDict
) -> Dict[str, Any]:
    """Parse the data of one flight-table row."""
    tds = tr.find_all("td", recursive=False)
    values = {}

    for idx, td in enumerate(tds):
        if idx >= len(headers):
            continue

        key = get_key_by_index(ordered_dict=headers, index=idx)
        value = get_value_by_index(ordered_dict=headers, index=idx)
        # If this is the "passenger info" column → use structured parsing
        if "乘客" in value or "乘客信息" in value:
            passenger_info = parse_order_flight_table_passenger_info(raw=td, headers=extend_headers)
            values.update(passenger_info)
        else:
            raw = clean_order_flight_table(html=td)
            if "行程" in value:
                code_dep_key = get_key_by_index(index=10, ordered_dict=extend_headers)
                code_arr_key = get_key_by_index(index=11, ordered_dict=extend_headers)
                raw_slice = raw.split("-")
                if len(raw_slice) == 2:
                    values[code_dep_key] = raw_slice[0]
                    values[code_arr_key] = raw_slice[-1]
            elif "PNR" in value:
                values[key] = raw
            else:
                values[key] = safe_convert_advanced(raw)

    return values


def extract_structured_table_data(table: Tag) -> List[Optional[Dict[str, Any]]]:
    """Extract structured table data."""
    # headers = parse_order_flight_table_headers(html=table)
    headers = flight_static_headers()
    extend = flight_extend_headers()
    rows = list()
    for tr in table.find_all("tr")[1:]:  # skip the header row
        rows.append(parse_order_flight_table_row(tr=tr, headers=headers, extend_headers=extend))

    return rows


def parser_order_flight_table(html: str) -> List[Optional[Dict[str, Any]]]:
    """Parse the flight table."""
    soup = BeautifulSoup(html, 'html.parser')
    # The three main order_sort divs
    order_sections = soup.find_all('div', class_='order_sort')
    section = order_sections[3] if len(order_sections) > 3 else Tag(name="")
    results = list()

    tables = section.find_all('table', class_='table table_border table_center')
    for table in tables:
        table_data = extract_structured_table_data(table)
        if table_data:
            results.extend(table_data)

    return results
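To make the copyFn parsing path above concrete, a small isolated exercise of parse_order_flight_table_passenger_info. The TD fragment is invented (modelled on the example embedded in the comments), and it assumes get_key_by_index returns the key stored at the given position of the OrderedDict.

# Exercises the copyFn branch on an invented fragment shaped like the in-code example.
from bs4 import BeautifulSoup
from qlv_helper.http.order_page import flight_extend_headers, parse_order_flight_table_passenger_info

fragment = """
<td>
  <span>【成人】</span>
  <img onclick="copyFn('吴世勇||身份证|140104194702241336|男||1947/2/24 0:00:00||1900/1/1 0:00:00')" src="/images/nav_assistant.png"/>
  <span name="guobie">中国</span>
  <span name="guobie">中国</span>
</td>
"""
td = BeautifulSoup(fragment, "html.parser").find("td")
info = parse_order_flight_table_passenger_info(raw=td, headers=flight_extend_headers())
print(info["p_name"], info["id_no"], info["birth_day"])  # expected: 吴世勇 140104194702241336 1947-02-24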