python-qlv-helper 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
+ # -*- coding: utf-8 -*-
+ """
+ # ---------------------------------------------------------------------------------------------------------
+ # ProjectName: qlv-helper
+ # FileName: order_table.py
+ # Description: Order list page controller
+ # Author: ASUS
+ # CreateDate: 2025/12/01
+ # Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
+ # ---------------------------------------------------------------------------------------------------------
+ """
+ import asyncio
+ from aiohttp import CookieJar
+ from typing import Optional, Dict, Any, Callable, List
+ from http_helper.client.async_proxy import HttpClientFactory
+ from qlv_helper.utils.html_utils import parse_pagination_info
+ from qlv_helper.http.order_table_page import get_domestic_activity_order_page_html, get_domestic_ticket_outed_page_html, \
+     parse_order_table
+
+
+ async def _get_paginated_order_table(
+         *,
+         domain: str,
+         protocol: str,
+         retry: int,
+         timeout: int,
+         enable_log: bool,
+         cookie_jar: Optional[CookieJar],
+         playwright_state: Optional[Dict[str, Any]],
+         table_state: str,
+         fetch_page_fn: Callable[..., Any],  # function that fetches the first page / a given page of HTML
+ ) -> Dict[str, Any]:
+     """Generic paginated table scraping (concurrency supported)."""
+
+     order_http_client = HttpClientFactory(
+         protocol=protocol if protocol == "http" else "https",
+         domain=domain,
+         timeout=timeout,
+         retry=retry,
+         enable_log=enable_log,
+         cookie_jar=cookie_jar,
+         playwright_state=playwright_state
+     )
+
+     # --- 1. Fetch the first page (serially) ---
+     response = await fetch_page_fn(
+         domain=domain, protocol=protocol, retry=retry, timeout=timeout,
+         enable_log=enable_log, cookie_jar=cookie_jar, playwright_state=playwright_state,
+         order_http_client=order_http_client, is_end=True
+     )
+     if response.get("code") != 200:
+         return response
+
+     html = response["data"]
+     table_data: List[Dict[str, Any]] = parse_order_table(html=html, table_state=table_state)
+
+     pagination_info = parse_pagination_info(html)
+     pages = pagination_info.get("pages", 1)
+
+     # --- 2. Only one page: return immediately ---
+     if pages <= 1:
+         pagination_info.update({
+             "data": table_data,
+             "is_next_page": False,
+             "page_size": len(table_data),
+             "pages": 1
+         })
+         response["data"] = pagination_info
+         return response
+
+     # --- 3. Multiple pages: fetch pages 2..pages concurrently ---
+     async def fetch_page(client: HttpClientFactory, page: int) -> List[Dict[str, Any]]:
+         """Single-page fetch task, used for concurrent scheduling."""
+         try:
+             resp = await fetch_page_fn(
+                 domain=domain, protocol=protocol, retry=retry, timeout=timeout,
+                 enable_log=enable_log, cookie_jar=cookie_jar, playwright_state=playwright_state,
+                 order_http_client=client, current_page=page, pages=pages, is_end=(page == pages)
+             )
+             if resp.get("code") == 200:
+                 return parse_order_table(html=resp["data"], table_state=table_state)
+         except Exception:
+             return list()  # a failed page yields no rows, so it does not break the whole run
+         return list()
+
+     # 🔥 Concurrency: fetch all remaining pages in one burst, with a fresh client
+     order_http_client = HttpClientFactory(
+         protocol=protocol if protocol == "http" else "https",
+         domain=domain,
+         timeout=timeout,
+         retry=retry,
+         enable_log=enable_log,
+         cookie_jar=cookie_jar,
+         playwright_state=playwright_state
+     )
+     tasks = [fetch_page(client=order_http_client, page=page) for page in range(2, pages + 1)]
+     results = await asyncio.gather(*tasks)
+
+     # Merge the per-page table data
+     for r in results:
+         if r:
+             table_data.extend(r)
+
+     # --- 4. Build the final response payload ---
+     pagination_info.update({
+         "data": table_data,
+         "is_next_page": False,
+         "page_size": len(table_data),
+         "pages": 1
+     })
+     response["data"] = pagination_info
+     return response
+
+
+ async def get_domestic_activity_order_table(
+         domain: str, protocol: str = "http", retry: int = 1, timeout: int = 5, enable_log: bool = True,
+         cookie_jar: Optional[CookieJar] = None, playwright_state: Optional[Dict[str, Any]] = None
+ ) -> Dict[str, Any]:
+     return await _get_paginated_order_table(
+         domain=domain,
+         protocol=protocol,
+         retry=retry,
+         timeout=timeout,
+         enable_log=enable_log,
+         cookie_jar=cookie_jar,
+         playwright_state=playwright_state,
+         table_state="proccessing",
+         fetch_page_fn=get_domestic_activity_order_page_html
+     )
+
+
+ async def get_domestic_ticket_outed_table(
+         domain: str, protocol: str = "http", retry: int = 1, timeout: int = 5, enable_log: bool = True,
+         cookie_jar: Optional[CookieJar] = None, playwright_state: Optional[Dict[str, Any]] = None
+ ) -> Dict[str, Any]:
+     return await _get_paginated_order_table(
+         domain=domain,
+         protocol=protocol,
+         retry=retry,
+         timeout=timeout,
+         enable_log=enable_log,
+         cookie_jar=cookie_jar,
+         playwright_state=playwright_state,
+         table_state="completed",
+         fetch_page_fn=get_domestic_ticket_outed_page_html
+     )
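
A minimal usage sketch for the paginated fetchers above. The domain is a placeholder and the import path is assumed from the FileName header; on success the merged rows land under response["data"]["data"]:

    import asyncio
    from qlv_helper.controller.order_table import get_domestic_activity_order_table  # module path assumed

    async def main() -> None:
        # One call fetches page 1, then pulls the remaining pages concurrently.
        response = await get_domestic_activity_order_table(domain="qlv.example.com", protocol="http")
        if response.get("code") == 200:
            pagination = response["data"]
            print(pagination["page_size"], "rows merged across all pages")
            for row in pagination["data"]:
                print(row)

    asyncio.run(main())
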
@@ -0,0 +1,119 @@
+ # -*- coding: utf-8 -*-
+ """
+ # ---------------------------------------------------------------------------------------------------------
+ # ProjectName: qlv-helper
+ # FileName: user_login.py
+ # Description: User login page controller
+ # Author: ASUS
+ # CreateDate: 2025/11/25
+ # Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
+ # ---------------------------------------------------------------------------------------------------------
+ """
+ import asyncio
+ from typing import Tuple
+ from qlv_helper.po.login_page import LoginPage
+ from playwright.async_api import BrowserContext
+ from qlv_helper.po.wechat_auth_page import WechatAuthPage
+ from qlv_helper.utils.browser_utils import switch_for_table_window
+ from qlv_helper.utils.po_utils import on_click_locator, locator_input_element
+
+
+ async def _username_login(login_po: LoginPage, username: str, password: str, timeout: float = 5.0) -> Tuple[bool, str]:
+     # 1. Enter the username
+     is_success, username_input = await login_po.get_login_username_input(timeout=timeout)
+     if is_success is False:
+         return is_success, username_input
+     await locator_input_element(locator=username_input, text=username.strip())
+
+     # 2. Enter the password
+     is_success, password_input = await login_po.get_login_password_input(timeout=timeout)
+     if is_success is False:
+         return is_success, password_input
+     await locator_input_element(locator=password_input, text=password.strip())
+
+     # 3. Read the first-level captcha
+     is_success, code_str = await login_po.get_number_code(timeout=timeout)
+     if is_success is False:
+         return is_success, code_str
+
+     # 4. Enter the first-level captcha
+     is_success, code_input = await login_po.get_login_number_code_input(timeout=timeout)
+     if is_success is False:
+         return is_success, code_input
+     await locator_input_element(locator=code_input, text=code_str.lower())
+
+     # 5. Click the login button
+     is_success, login_btn = await login_po.get_login_btn(timeout=timeout)
+     if is_success is False:
+         return is_success, login_btn
+     await on_click_locator(locator=login_btn)
+     # Explicit return so the annotated Tuple[bool, str] always holds
+     return True, f"Account {username}: login submitted"
+
+
+ async def _wechat_login(browser: BrowserContext, login_po: LoginPage, timeout: float = 5.0) -> Tuple[bool, str]:
+     # 1. Click the WeChat quick-login entrance
+     is_success, wechat_entrance = await login_po.get_wechat_entrance(timeout=timeout)
+     if is_success is False:
+         return is_success, wechat_entrance
+     await on_click_locator(locator=wechat_entrance)
+
+     page_new = await switch_for_table_window(browser=browser, url_keyword="open.weixin.qq.com", wait_time=int(timeout))
+     wechat_po = WechatAuthPage(page=page_new)
+
+     # 2. Click the "WeChat quick login" button
+     is_success, wechat_quick_login_btn = await wechat_po.get_wechat_quick_login_btn(timeout=timeout)
+     if is_success is False:
+         return is_success, wechat_quick_login_btn
+     await on_click_locator(locator=wechat_quick_login_btn)
+
+     # 3. Click the "Allow" button in the WeChat dialog
+     return await wechat_po.on_click_allow_btn(timeout=int(timeout) * 3)
+
+
+ async def username_login(
+         login_po: LoginPage, username: str, password: str, timeout: float = 5.0, retry: int = 3
+ ) -> Tuple[bool, str]:
+     # 1. First full login pass
+     await _username_login(login_po=login_po, username=username, password=password, timeout=timeout)
+     for _ in range(retry):
+         # 2. Check whether we are still on the login page
+         if login_po.is_current_page() is False:
+             return True, f"Account {username}: login succeeded"
+
+         # 3. If a login warning is shown, re-enter the captcha and log in again
+         is_warn: bool = await login_po.is_exist_login_warn(timeout=timeout)
+         if is_warn is True:
+             # 4. Read the first-level captcha
+             is_success, code_str = await login_po.get_number_code(timeout=timeout)
+             if is_success is False:
+                 return is_success, code_str
+
+             # 5. Enter the first-level captcha
+             is_success, code_input = await login_po.get_login_number_code_input(timeout=timeout)
+             if is_success is False:
+                 return is_success, code_input
+             await locator_input_element(locator=code_input, text=code_str.lower())
+
+             # 6. Click the login button
+             is_success, login_btn = await login_po.get_login_btn(timeout=timeout)
+             if is_success is False:
+                 return is_success, login_btn
+             await on_click_locator(locator=login_btn)
+         else:
+             # 7. Repeat the full login pass
+             await _username_login(login_po=login_po, username=username, password=password, timeout=timeout)
+
+         await asyncio.sleep(delay=timeout)
+
+     return True, f"Account {username}: one full login pass finished"
+
+
+ async def wechat_login(
+         browser: BrowserContext, login_po: LoginPage, timeout: float = 5.0, retry: int = 3
+ ) -> Tuple[bool, str]:
+     for index in range(retry):
+         # Run the full WeChat login flow
+         is_success, message = await _wechat_login(browser=browser, login_po=login_po, timeout=timeout)
+
+         # Return on success, or after the last attempt
+         if is_success is True or index == retry - 1:
+             return is_success, message
+     # Fallback so the annotated Tuple[bool, str] also holds when retry <= 0
+     return False, "WeChat login was not attempted (retry <= 0)"
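
A hedged driving sketch for these flows. Everything except the calls defined in this file is an assumption: the login URL, the LoginPage constructor signature, and the module path are placeholders:

    import asyncio
    from playwright.async_api import async_playwright
    from qlv_helper.po.login_page import LoginPage
    from qlv_helper.controller.user_login import username_login  # module path assumed

    async def main() -> None:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto("http://qlv.example.com/Login")  # hypothetical login URL
            login_po = LoginPage(page=page)  # constructor signature assumed
            ok, message = await username_login(login_po=login_po, username="user", password="secret")
            print(ok, message)
            await browser.close()

    asyncio.run(main())
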
@@ -0,0 +1,11 @@
+ # -*- coding: utf-8 -*-
+ """
+ # ---------------------------------------------------------------------------------------------------------
+ # ProjectName: qlv-helper
+ # FileName: __init__.py
+ # Description: Package for HTTP-protocol related handling
+ # Author: ASUS
+ # CreateDate: 2025/11/28
+ # Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
+ # ---------------------------------------------------------------------------------------------------------
+ """
@@ -0,0 +1,41 @@
+ # -*- coding: utf-8 -*-
+ """
+ # ---------------------------------------------------------------------------------------------------------
+ # ProjectName: qlv-helper
+ # FileName: main_page.py
+ # Description: HTTP response handling module for the home page
+ # Author: ASUS
+ # CreateDate: 2025/11/29
+ # Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
+ # ---------------------------------------------------------------------------------------------------------
+ """
+ import aiohttp
+ from bs4 import BeautifulSoup
+ from typing import Dict, Any, Optional
+ from http_helper.client.async_proxy import HttpClientFactory
+
+
+ async def get_main_page_html(
+         domain: str, protocol: str = "http", retry: int = 1, timeout: int = 5, enable_log: bool = True,
+         cookie_jar: Optional[aiohttp.CookieJar] = None, playwright_state: Optional[Dict[str, Any]] = None
+ ) -> Dict[str, Any]:
+     order_http_client = HttpClientFactory(
+         protocol=protocol if protocol == "http" else "https",
+         domain=domain,
+         timeout=timeout,
+         retry=retry,
+         enable_log=enable_log,
+         cookie_jar=cookie_jar,
+         playwright_state=playwright_state
+     )
+     return await order_http_client.request(method="get", url="/", is_end=True)
+
+
+ def parser_head_title(html: str) -> str:
+     soup = BeautifulSoup(html, "html.parser")
+     # Extract the <title> text
+     title_tag = soup.find('title')
+     if title_tag:
+         return title_tag.get_text().strip()
+     else:
+         return ""
@@ -0,0 +1,313 @@
+ # -*- coding: utf-8 -*-
+ """
+ # ---------------------------------------------------------------------------------------------------------
+ # ProjectName: qlv-helper
+ # FileName: order_page.py
+ # Description: HTTP response handling module for the order page
+ # Author: ASUS
+ # CreateDate: 2025/11/28
+ # Copyright ©2011-2025. Hunan xxxxxxx Company limited. All rights reserved.
+ # ---------------------------------------------------------------------------------------------------------
+ """
+ import re
+ import aiohttp
+ from datetime import datetime
+ from bs4 import BeautifulSoup, Tag
+ from collections import OrderedDict
+ from typing import Dict, Any, Optional, List
+ from http_helper.client.async_proxy import HttpClientFactory
+ from qlv_helper.utils.datetime_utils import get_current_dtstr
+ from qlv_helper.utils.type_utils import convert_cn_to_en, get_key_by_index, get_value_by_index, safe_convert_advanced
+
+
+ async def get_order_page_html(
+         order_id: int, domain: str, protocol: str = "http", retry: int = 1, timeout: int = 5, enable_log: bool = True,
+         cookie_jar: Optional[aiohttp.CookieJar] = None, playwright_state: Optional[Dict[str, Any]] = None
+ ) -> Dict[str, Any]:
+     order_http_client = HttpClientFactory(
+         protocol=protocol if protocol == "http" else "https",
+         domain=domain,
+         timeout=timeout,
+         retry=retry,
+         enable_log=enable_log,
+         cookie_jar=cookie_jar,
+         playwright_state=playwright_state
+     )
+     return await order_http_client.request(
+         method="get",
+         url=f"/OrderProcessing/NewTicket_show/{order_id}?&r={get_current_dtstr()}",
+         is_end=True
+     )
+
+
+ def order_info_static_headers() -> OrderedDict[str, str]:
+     # Maps English field names to the Chinese labels scraped from the page; the labels must stay as-is.
+     return OrderedDict([
+         ("receipted_ota", "OTA实收"),  # 0
+         ("kickback", "佣金"),  # 1
+         ("raw_order_no", "平台订单号"),  # 2
+         ("trip_type", "行程类型"),  # 3
+         ("id", "订单号"),  # 4
+         ("stat_opration", "订单操作"),  # 5
+         ("stat_order", "订单状态")  # 6
+     ])
+
+
+ def parser_order_info(html: str) -> Dict[str, Any]:
+     soup = BeautifulSoup(html, "html.parser")
+     # Locate the target table
+     table = soup.find("table", class_="table no_border")
+     if not table:
+         return {}
+
+     # All td cells
+     tds = table.find_all("td")
+     result = {}
+
+     for td in tds:
+         text = td.get_text(strip=True)
+         # Skip tds that contain no colon (e.g. buttons)
+         if ":" not in text:
+             continue
+         # Split on the first ":"
+         try:
+             key, value = text.split(":", 1)
+         except ValueError:
+             continue
+         # Strip line breaks and whitespace
+         key = key.strip()
+         value = value.strip()
+
+         # If value is empty, try the text of a nested <b> or other control
+         if not value:
+             b = td.find("b")
+             if b:
+                 value = b.get_text(strip=True)
+
+         # Drop non-breaking spaces and surrounding brackets
+         value = value.replace("\u00a0", "").strip()
+         value = value[1:] if isinstance(value, str) and value.startswith("[") else value
+         value = value[:-1] if isinstance(value, str) and value.endswith("]") else value
+         result[key] = safe_convert_advanced(value)
+     return convert_cn_to_en(data=result, header_map=order_info_static_headers())
+
+
+ def flight_static_headers() -> OrderedDict[str, str]:
+     # English field names mapped to the Chinese column headers of the flight table.
+     return OrderedDict([
+         ("ticket_state", "票状态"),  # 0
+         ("passenger_info", "乘客信息"),  # 1
+         ("price_std", "票面价"),  # 2
+         ("price_sell", "销售价"),  # 3
+         ("tax_air", "机建费"),  # 4
+         ("tax_fuel", "燃油费"),  # 5
+         ("pnr", "PNR"),  # 6
+         ("itinerary", "行程"),  # 7
+         ("ticket_no", "票号"),  # 8
+         ("itinerary_no", "行程单号"),  # 9
+     ])
+
+
+ def flight_extend_headers() -> OrderedDict[str, str]:
+     # Extended per-passenger fields; the keys are consumed positionally via get_key_by_index.
+     return OrderedDict([
+         ("p_name", "姓名"),  # 0
+         ("p_type", "类型"),  # 1
+         ("id_type", "证件类型"),  # 2
+         ("id_no", "身份证"),  # 3
+         ("birth_day", "出生年月"),  # 4
+         ("age", "年龄"),  # 5
+         ("gender", "性别"),  # 6
+         ("new_nation", "国籍"),  # 7
+         ("card_issue_place", "签发国"),  # 8
+         ("id_valid_dat", "证件有效期"),  # 9
+         ("code_dep", "起飞机场"),  # 10
+         ("code_arr", "抵达机场"),  # 11
+     ])
+
+
+ def parse_order_flight_table_headers(html: Tag) -> OrderedDict[str, str]:
+     """Parse the flight table header row."""
+     headers = OrderedDict()
+     for th in html.find_all("th"):
+         # Take only the direct text, not the text of child tags
+         direct_texts = th.find_all(text=True, recursive=False)
+         header = "".join(t.strip() for t in direct_texts if t.strip()) or th.get_text(strip=True)
+         headers[header] = header
+     return headers
+
+
+ def clean_order_flight_table(html: Tag) -> str:
+     """Strip the flight-table HTML of irrelevant content."""
+     # Remove script / style tags
+     for tag in html.find_all(["script", "style"]):
+         tag.extract()
+
+     # Also drop tags hidden with display:none
+     for tag in html.find_all(style=True):
+         style = tag["style"].replace(" ", "").lower()
+         if "display:none" in style:
+             tag.extract()
+
+     return html.get_text(strip=True)
+
+
+ def parse_order_flight_table_passenger_info(raw: Tag, headers: OrderedDict[str, str]) -> Dict[str, Any]:
+     """
+     Parse a passenger-info TD containing a copyFn(...) handler and return a dict.
+     """
+     # -------------------------------
+     # 1. Parse e.g. <img onclick="copyFn('吴世勇||身份证|140104194702241336|男||1947/2/24 0:00:00||1900/1/1 0:00:00')" src="/images/nav_assistant.png" style="width:20px; height:20px; margin:0px;"/>
+     # -------------------------------
+     img = raw.find("img", onclick=True)
+     if not img:
+         return {}
+
+     # Extract the argument string from the handler
+     onclick = img["onclick"]
+     m = re.search(r"copyFn\('(.+?)'\)", onclick)
+     if not m:
+         return {}
+
+     group = m.group(1)  # "吴世勇||身份证|140104194702241336|男||1947/2/24 0:00:00||1900/1/1 0:00:00"
+     parts = group.split("|")
+
+     # Field layout:
+     # 0: name
+     # 1: empty
+     # 2: ID document type
+     # 3: ID number
+     # 4: gender
+     # 5: empty
+     # 6: date of birth
+     # 7: empty
+     # 8: ID expiry date (may be absent)
+     name = parts[0]
+     id_type = parts[2]
+     id_no_raw = parts[3]
+     sex = parts[4]
+     birth_raw = parts[6]
+     id_valid_raw = parts[8] if len(parts) > 8 else ""
+
+     # -------------------------------
+     # 2. Mask the ID number (to match the page display); currently disabled
+     # -------------------------------
+     # if len(id_no_raw) >= 8:
+     #     id_no_masked = id_no_raw[:6] + "****" + id_no_raw[-4:]
+     # else:
+     #     id_no_masked = id_no_raw
+
+     # -------------------------------
+     # 3. Normalise dates: 1947/2/24 → 1947-02-24
+     # -------------------------------
+     def fmt_date(value: str) -> str:
+         try:
+             dt = datetime.strptime(value.split(" ")[0], "%Y/%m/%d")
+             return dt.strftime("%Y-%m-%d")
+         except Exception:
+             return ""
+
+     birth = fmt_date(birth_raw)
+     id_valid = fmt_date(id_valid_raw)
+
+     # -------------------------------
+     # 4. Compute the age
+     # -------------------------------
+     def calc_age(birth_str: str) -> Optional[int]:
+         try:
+             birthday = datetime.strptime(birth_str, "%Y-%m-%d")
+             today = datetime.today()
+             return today.year - birthday.year - ((today.month, today.day) < (birthday.month, birthday.day))
+         except Exception:
+             return None
+
+     age = calc_age(birth)
+
+     # -------------------------------
+     # 5. Passenger type (adult/child/infant) is read from the page itself
+     # -------------------------------
+     type_span = raw.find(name="span", text=re.compile(r"(成人|儿童|婴儿)"))
+     ptype = ""
+     if type_span:
+         ptype = type_span.get_text(strip=True).replace("【", "").replace("】", "")
+
+     # -------------------------------
+     # 6. Nationality and issuing country come from the two spans with name="guobie"
+     # -------------------------------
+     guobies = raw.find_all("span", attrs={"name": "guobie"})
+     nationality = guobies[0].get_text(strip=True) if len(guobies) > 0 else ""
+     issue_country = guobies[1].get_text(strip=True) if len(guobies) > 1 else ""
+
+     return {
+         get_key_by_index(index=0, ordered_dict=headers): name,  # name
+         get_key_by_index(index=1, ordered_dict=headers): ptype,  # type: adult/child/infant
+         get_key_by_index(index=2, ordered_dict=headers): id_type,  # ID document type
+         get_key_by_index(index=3, ordered_dict=headers): id_no_raw,  # ID number
+         get_key_by_index(index=4, ordered_dict=headers): birth,  # date of birth
+         get_key_by_index(index=5, ordered_dict=headers): age,  # age
+         get_key_by_index(index=6, ordered_dict=headers): sex,  # gender
+         get_key_by_index(index=7, ordered_dict=headers): nationality,  # nationality
+         get_key_by_index(index=8, ordered_dict=headers): issue_country,  # issuing country
+         get_key_by_index(index=9, ordered_dict=headers): id_valid  # ID expiry date
+     }
+
+
+ def parse_order_flight_table_row(
+         tr: Tag, headers: OrderedDict, extend_headers: OrderedDict
+ ) -> Dict[str, Any]:
+     """Parse one data row of the flight table."""
+     tds = tr.find_all("td", recursive=False)
+     values = {}
+
+     for idx, td in enumerate(tds):
+         if idx >= len(headers):
+             continue
+
+         key = get_key_by_index(ordered_dict=headers, index=idx)
+         value = get_value_by_index(ordered_dict=headers, index=idx)
+         # Passenger-info column → structured parsing
+         if "乘客" in value:
+             passenger_info = parse_order_flight_table_passenger_info(raw=td, headers=extend_headers)
+             values.update(passenger_info)
+         else:
+             raw = clean_order_flight_table(html=td)
+             if "行程" in value:
+                 # Itinerary column → split "DEP-ARR" into departure/arrival airport codes
+                 code_dep_key = get_key_by_index(index=10, ordered_dict=extend_headers)
+                 code_arr_key = get_key_by_index(index=11, ordered_dict=extend_headers)
+                 raw_slice = raw.split("-")
+                 if len(raw_slice) == 2:
+                     values[code_dep_key] = raw_slice[0]
+                     values[code_arr_key] = raw_slice[-1]
+             elif "PNR" in value:
+                 values[key] = raw
+             else:
+                 values[key] = safe_convert_advanced(raw)
+
+     return values
+
+
+ def extract_structured_table_data(table: Tag) -> List[Dict[str, Any]]:
+     """Extract structured row data from one flight table."""
+     # headers = parse_order_flight_table_headers(html=table)
+     headers = flight_static_headers()
+     extend = flight_extend_headers()
+     rows = list()
+     for tr in table.find_all("tr")[1:]:  # skip the header row
+         rows.append(parse_order_flight_table_row(tr=tr, headers=headers, extend_headers=extend))
+
+     return rows
+
+
+ def parser_order_flight_table(html: str) -> List[Dict[str, Any]]:
+     """Parse the flight tables of an order page."""
+     soup = BeautifulSoup(html, 'html.parser')
+     # The flight tables live in the fourth of the page's order_sort divs
+     order_sections = soup.find_all('div', class_='order_sort')
+     section = order_sections[3] if len(order_sections) > 3 else Tag(name="")
+     results = list()
+
+     tables = section.find_all('table', class_='table table_border table_center')
+     for table in tables:
+         table_data = extract_structured_table_data(table)
+         if table_data:
+             results.extend(table_data)
+
+     return results
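
A minimal end-to-end sketch for this module: fetch a single order page, then run both parsers over the returned HTML. The order id, domain, and module path are placeholders:

    import asyncio
    from qlv_helper.http.order_page import get_order_page_html, parser_order_info, parser_order_flight_table  # path assumed

    async def main() -> None:
        response = await get_order_page_html(order_id=123456, domain="qlv.example.com")
        if response.get("code") == 200:
            html = response["data"]
            print(parser_order_info(html))          # order-level fields, keys mapped to English
            print(parser_order_flight_table(html))  # one dict per passenger/flight row

    asyncio.run(main())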