@youhaozhao/cninfo-mcp 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -54
- package/package.json +17 -4
- package/python/mcp_server.py +193 -268
- package/python/spider.py +351 -260
package/python/spider.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
从巨潮资讯查询和下载上市公司定期报告、招股书
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import datetime
|
|
@@ -32,6 +32,101 @@ QUERY_URL = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
|
|
|
32
32
|
MAX_RETRIES = 3
|
|
33
33
|
RETRY_BACKOFF = 1.0
|
|
34
34
|
|
|
35
|
+
|
|
36
|
+
# Alias table for user-supplied report types: alias -> canonical key.
# Each canonical key is listed with the English spellings, the abbreviations
# that also appear in the query category ids (e.g. "ndbg"), and the Chinese
# names that resolve to it.
REPORT_TYPE_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        (
            "annual",
            ("annual", "annual_report", "yearly", "ndbg", "年报", "年度报告"),
        ),
        (
            "semiannual",
            (
                "semiannual",
                "semi_annual",
                "half_year",
                "half-year",
                "bndbg",
                "半年度报告",
                "半年报",
                "中报",
            ),
        ),
        (
            "q1",
            ("q1", "first_quarter", "yjdbg", "一季报", "第一季度报告"),
        ),
        (
            "q3",
            ("q3", "third_quarter", "sjdbg", "三季报", "第三季度报告"),
        ),
        (
            "prospectus",
            ("prospectus", "ipo", "招股书", "招股说明书", "招股意向书"),
        ),
    )
    for alias in aliases
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Per-type specification, keyed by canonical report type.
#   label    - Chinese display name used in log and result messages
#   category - value sent in the query's "category" field ("" for prospectus)
#   patterns - title templates; "{year}" and "{suffix}" are filled in with
#              str.format before the result is used as a regular expression
#   keywords - (prospectus only) substrings that identify a prospectus title
REPORT_TYPE_SPECS = dict(
    annual=dict(
        label="年度报告",
        category="category_ndbg_szsh",
        patterns=[
            r".*{year}年年度报告{suffix}",
            r".*{year}年度报告{suffix}",
            r".*{year}年报{suffix}",
        ],
    ),
    semiannual=dict(
        label="半年度报告",
        category="category_bndbg_szsh",
        patterns=[
            r".*{year}年半年度报告{suffix}",
            r".*{year}半年度报告{suffix}",
            r".*{year}年中期报告{suffix}",
        ],
    ),
    q1=dict(
        label="第一季度报告",
        category="category_yjdbg_szsh",
        patterns=[
            r".*{year}年第一季度报告{suffix}",
            r".*{year}第一季度报告{suffix}",
            r".*{year}年一季度报告{suffix}",
            r".*{year}一季度报告{suffix}",
        ],
    ),
    q3=dict(
        label="第三季度报告",
        category="category_sjdbg_szsh",
        patterns=[
            r".*{year}年第三季度报告{suffix}",
            r".*{year}第三季度报告{suffix}",
            r".*{year}年三季度报告{suffix}",
            r".*{year}三季度报告{suffix}",
        ],
    ),
    prospectus=dict(
        label="招股书",
        category="",
        keywords=["招股书", "招股说明书", "招股意向书"],
    ),
)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# Substrings marking an announcement as something other than the report body
# itself; a title containing any of them is rejected by the title matcher.
COMMON_EXCLUDE_KEYWORDS = [
    "摘要",  # summary / abstract
    "确认意见",  # confirmation opinion
    "取消",  # cancellation
    "更正",  # correction
    "补充",  # supplement
    "说明",  # explanatory note
    "提示",  # notice / reminder
    "致歉",  # apology
    "修订",  # revision
    "英文",  # English-language version
]
|
|
128
|
+
|
|
129
|
+
|
|
35
130
|
User_Agent = [
|
|
36
131
|
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
|
|
37
132
|
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
|
|
@@ -55,6 +150,23 @@ BASE_HEADERS = {
|
|
|
55
150
|
}
|
|
56
151
|
|
|
57
152
|
|
|
153
|
+
def supported_report_types() -> dict:
    """Return the supported report types as ``{canonical key: display label}``."""
    labels = {}
    for type_key in REPORT_TYPE_SPECS:
        labels[type_key] = REPORT_TYPE_SPECS[type_key]["label"]
    return labels
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def normalize_report_type(report_type: Optional[str]) -> str:
    """Map an English or Chinese report-type alias to its canonical key.

    The input is stringified, stripped and lower-cased, and spaces are
    replaced by underscores before the alias lookup. A falsy input
    (``None``, ``""``) is treated as ``"annual"``.

    Args:
        report_type: Alias supplied by the caller, e.g. ``"Annual Report"``,
            ``"semi-annual"`` or ``"年报"``.

    Returns:
        The canonical key stored in ``REPORT_TYPE_ALIASES`` for that alias.

    Raises:
        ValueError: If the alias is unknown; the message lists the
            supported canonical keys.
    """
    key = str(report_type or "annual").strip().lower().replace(" ", "_")
    normalized = REPORT_TYPE_ALIASES.get(key)
    if normalized is None and "-" in key:
        # Hyphenated spellings ("semi-annual", "first-quarter") are as common
        # as the underscore forms. Retry with hyphens folded to underscores;
        # the exact key is tried first so explicit aliases keep priority.
        normalized = REPORT_TYPE_ALIASES.get(key.replace("-", "_"))
    if normalized is None:
        supported = ", ".join(supported_report_types())
        raise ValueError(
            f"Unsupported report_type '{report_type}'. Supported: {supported}"
        )
    return normalized
|
|
168
|
+
|
|
169
|
+
|
|
58
170
|
def _build_headers() -> dict:
|
|
59
171
|
"""构造请求头,避免在并发场景下修改全局字典。"""
|
|
60
172
|
headers = BASE_HEADERS.copy()
|
|
@@ -74,9 +186,7 @@ def _post_json(url: str, data: dict) -> dict:
|
|
|
74
186
|
last_exc = None
|
|
75
187
|
for attempt in range(MAX_RETRIES):
|
|
76
188
|
try:
|
|
77
|
-
resp = requests.post(
|
|
78
|
-
url, headers=_build_headers(), data=data, timeout=30
|
|
79
|
-
)
|
|
189
|
+
resp = requests.post(url, headers=_build_headers(), data=data, timeout=30)
|
|
80
190
|
resp.raise_for_status()
|
|
81
191
|
return resp.json()
|
|
82
192
|
except requests.exceptions.HTTPError as e:
|
|
@@ -161,349 +271,330 @@ def _paginate(fetch_fn, stock):
|
|
|
161
271
|
if not items:
|
|
162
272
|
break
|
|
163
273
|
all_items.extend(items)
|
|
164
|
-
if len(items) < PAGE_SIZE:
|
|
274
|
+
if len(items) < PAGE_SIZE:
|
|
165
275
|
break
|
|
166
276
|
else:
|
|
167
277
|
logger.warning("翻页达到上限 %s,结果可能被截断(%s)", MAX_PAGES, stock)
|
|
168
278
|
return all_items
|
|
169
279
|
|
|
170
280
|
|
|
171
|
-
def
|
|
172
|
-
|
|
173
|
-
) -> bool:
|
|
174
|
-
"""
|
|
175
|
-
判断标题是否为“年度报告正文”。
|
|
281
|
+
def _compact_title(title: str) -> str:
|
|
282
|
+
return re.sub(r"\s+", "", title or "")
|
|
176
283
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
"
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
"
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
284
|
+
|
|
285
|
+
def _is_report_title(
    title: str,
    report_type: str,
    year_filter: Optional[Union[int, str]] = None,
) -> bool:
    """Tell whether *title* names the body of a report of the given type.

    Args:
        title: Announcement title; whitespace is ignored when matching.
        report_type: Report type or alias, resolved by
            ``normalize_report_type``.
        year_filter: Year that must appear in the title. ``None`` accepts
            any four-digit year. Not consulted for prospectuses.

    Returns:
        ``True`` if the title matches the type and carries no exclusion
        keyword, otherwise ``False``.
    """
    squeezed = _compact_title(title)
    canonical = normalize_report_type(report_type)
    type_spec = REPORT_TYPE_SPECS[canonical]

    if canonical == "prospectus":
        for keyword in type_spec["keywords"]:
            if keyword not in squeezed:
                continue
            # Drop the matched prospectus name before testing the exclusion
            # keywords: "招股说明书" itself contains "说明", which would
            # otherwise reject every such title.
            leftover = squeezed.replace(keyword, "")
            for excluded in COMMON_EXCLUDE_KEYWORDS:
                if excluded in leftover:
                    return False
            return True
        return False

    # Summaries, corrections, revisions and similar variants are not the
    # report body.
    for excluded in COMMON_EXCLUDE_KEYWORDS:
        if excluded in squeezed:
            return False

    if year_filter is None:
        year_expr = r"\d{4}"
    else:
        year_expr = re.escape(str(year_filter))
    # Optional "(更新后)" marker, with ASCII or full-width parentheses.
    suffix_expr = r"(?:[((]更新后[))])?"

    for template in type_spec["patterns"]:
        regex = template.format(year=year_expr, suffix=suffix_expr)
        if re.fullmatch(regex, squeezed):
            return True
    return False
|
|
210
315
|
|
|
211
316
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
317
|
+
def _is_annual_report_title(
    title: str, year_filter: Optional[Union[int, str]] = None
) -> bool:
    """Backward-compatible shim: check *title* against the annual-report type."""
    return _is_report_title(
        title=title, report_type="annual", year_filter=year_filter
    )
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _matches_year(
|
|
325
|
+
announcement: dict, report_type: str, year: Optional[Union[int, str]]
|
|
326
|
+
):
|
|
327
|
+
if year is None:
|
|
328
|
+
return True
|
|
329
|
+
normalized_type = normalize_report_type(report_type)
|
|
330
|
+
if normalized_type == "prospectus":
|
|
331
|
+
announcement_time = announcement.get("announcementTime", "")
|
|
332
|
+
# announcementTime 通常是 "YYYY-MM-DD" 字符串;个别接口可能返回 epoch 毫秒
|
|
333
|
+
if isinstance(announcement_time, (int, float)):
|
|
334
|
+
announcement_time = datetime.datetime.fromtimestamp(
|
|
335
|
+
announcement_time / 1000
|
|
336
|
+
).strftime("%Y-%m-%d")
|
|
337
|
+
return str(announcement_time).startswith(str(year))
|
|
338
|
+
return _is_report_title(
|
|
339
|
+
announcement.get("announcementTitle", ""), normalized_type, year_filter=year
|
|
340
|
+
)
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _build_report_query(
    page: int,
    stock_code: str,
    report_type: str,
    column: str,
    plate: str,
    stock_value: str = "",
) -> dict:
    """Assemble the form payload for one page of an announcement query.

    Args:
        page: Page number to request.
        stock_code: Stock code; used in ``searchkey`` only when
            *stock_value* is empty.
        report_type: Report type or alias, resolved by
            ``normalize_report_type``.
        column: Value for the ``column`` field.
        plate: Value for the ``plate`` field.
        stock_value: Value for the ``stock`` field. When non-empty the stock
            code is left out of ``searchkey``.

    Returns:
        The query parameters as a dict.
    """
    canonical = normalize_report_type(report_type)
    category = REPORT_TYPE_SPECS[canonical]["category"]
    is_prospectus = canonical == "prospectus"

    # Prospectus queries always search for "招股"; the stock code is added to
    # the search key only when no explicit ``stock`` value identifies it.
    if stock_value:
        searchkey = "招股" if is_prospectus else ""
    else:
        searchkey = f"{stock_code} 招股" if is_prospectus else stock_code

    payload = {
        "pageNum": page,
        "pageSize": PAGE_SIZE,
        "tabName": "fulltext",
        "column": column,
        "stock": stock_value,
        "searchkey": searchkey,
        "secid": "",
        "plate": plate,
        "category": category,
        "trade": "",
        "seDate": _date_range(EARLIEST_DATE),
    }
    return payload
|
|
227
372
|
|
|
373
|
+
|
|
374
|
+
def _query_exchange_report(
    page: int,
    stock_code: str,
    report_type: str,
    column: str,
    plate: str,
    stock_value: str = "",
) -> list:
    """Build the query for one page and run it through ``_query_announcements``.

    The arguments are passed unchanged to ``_build_report_query``.
    """
    return _query_announcements(
        _build_report_query(
            page, stock_code, report_type, column, plate, stock_value
        )
    )
|
|
229
391
|
|
|
230
392
|
|
|
393
|
+
def _sanitize_filename(name: str) -> str:
|
|
394
|
+
return re.sub(r'[\\/:*?"<>|]', "", name).strip()
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
# 深市 年度报告
|
|
398
|
+
def szseAnnual(page, stock):
    """Query one page of annual reports with column "szse" and plate "sz"."""
    return _query_exchange_report(
        page=page,
        stock_code=stock,
        report_type="annual",
        column="szse",
        plate="sz",
    )
|
|
400
|
+
|
|
401
|
+
|
|
231
402
|
# 沪市 年度报告
|
|
232
403
|
def sseAnnual(page, stock):
    """Query one page of annual reports with column "sse" and plate "sh"."""
    return _query_exchange_report(
        page=page,
        stock_code=stock,
        report_type="annual",
        column="sse",
        plate="sh",
    )
|
|
248
405
|
|
|
249
406
|
|
|
250
407
|
# 北交所 年度报告
|
|
251
408
|
def bseAnnual(page, stock):
    """Query one page of Beijing Stock Exchange annual reports.

    *stock* has the form ``"code,orgId"``; it is sent whole as the ``stock``
    value, and the part before the first comma is used as the stock code.
    """
    code, _, _ = str(stock).partition(",")
    return _query_exchange_report(
        page=page,
        stock_code=code,
        report_type="annual",
        column="bj",
        plate="bj",
        stock_value=stock,
    )
|
|
272
412
|
|
|
273
413
|
|
|
274
414
|
# 深市 招股
|
|
275
415
|
def szseStock(page, stock):
    """Query one page of prospectuses with column "szse" and plate "sz"."""
    return _query_exchange_report(
        page=page,
        stock_code=stock,
        report_type="prospectus",
        column="szse",
        plate="sz",
    )
|
|
291
417
|
|
|
292
418
|
|
|
293
419
|
# 沪市 招股
|
|
294
420
|
def sseStock(page, stock):
    """Query one page of prospectuses with column "sse" and plate "sh"."""
    return _query_exchange_report(
        page=page,
        stock_code=stock,
        report_type="prospectus",
        column="sse",
        plate="sh",
    )
|
|
310
422
|
|
|
311
423
|
|
|
312
424
|
def Download(
    single_page,
    report_type: Optional[str] = None,
    year_filter: Optional[Union[int, str]] = None,
    save_path: Optional[str] = None,
):
    """Download the PDF of every wanted report in an announcement list.

    Args:
        single_page: Iterable of announcement dicts, or ``None``.
        report_type: Report type or alias to keep. When falsy, a title is
            kept if it matches any type in ``REPORT_TYPE_SPECS``.
        year_filter: Year passed on to the title check.
        save_path: Target directory; falls back to the module-level
            ``saving_path``.

    Returns:
        The number of files written (``0`` when *single_page* is ``None``).

    Raises:
        requests.HTTPError: If a download responds with an error status
            (raised by ``raise_for_status``).
    """
    if single_page is None:
        return 0

    target_dir = (save_path or saving_path).rstrip("/\\") + "/"
    saved = 0
    wanted_type = normalize_report_type(report_type) if report_type else None
    # With an explicit type only that type is checked; otherwise every known
    # type is a candidate.
    candidates = [wanted_type] if wanted_type else REPORT_TYPE_SPECS

    for item in single_page:
        title = item.get("announcementTitle", "")
        wanted = any(
            _is_report_title(title, kind, year_filter=year_filter)
            for kind in candidates
        )
        if not wanted:
            continue

        pdf_path = item.get("adjunctUrl", "")
        if not pdf_path:
            logger.warning("公告缺少 adjunctUrl,跳过:%s", title)
            continue

        url = download_path + pdf_path
        raw_name = (
            "_".join((item.get("secCode", ""), item.get("secName", ""), title))
            + ".pdf"
        )
        name = _sanitize_filename(raw_name)

        logger.info("↓ %s", name)
        os.makedirs(target_dir, exist_ok=True)

        # Random pause of up to two seconds before each request.
        time.sleep(random.random() * 2)

        response = requests.get(
            url, headers={"User-Agent": random.choice(User_Agent)}, timeout=30
        )
        response.raise_for_status()
        with open(target_dir + name, "wb") as out:
            out.write(response.content)
        saved += 1

    return saved
|
|
376
478
|
|
|
377
479
|
|
|
378
|
-
def
|
|
379
|
-
"""
|
|
480
|
+
def query_reports(stock_code, report_type="annual", year=None):
    """Query the announcements of one report type for a stock.

    The "sse"/"sh" and "szse"/"sz" boards are always queried. The "bj" board
    is queried as well when ``_is_bse_code`` accepts the code and
    ``_resolve_org_id`` resolves it. A failure on one board is logged and
    the other boards are still processed.

    Args:
        stock_code: Stock code to look up.
        report_type: Report type or alias, resolved by
            ``normalize_report_type``.
        year: Optional year filter.

    Returns:
        Announcement dicts for the requested stock, de-duplicated and
        restricted to titles that match the report type (and year).
    """
    normalized_type = normalize_report_type(report_type)
    collected = []
    digits_only = re.sub(r"\D", "", str(stock_code or ""))
    allowed_codes = {digits_only} if digits_only else set()

    boards = (
        ("sse", "sh", "沪市"),
        ("szse", "sz", "深市"),
    )
    for column, plate, market in boards:

        def fetch_page(page, _stock, c=column, p=plate):
            return _query_exchange_report(page, stock_code, normalized_type, c, p)

        try:
            collected.extend(_paginate(fetch_page, stock_code))
        except Exception as exc:
            logger.warning(
                "%s%s查询失败: %s",
                market,
                REPORT_TYPE_SPECS[normalized_type]["label"],
                exc,
            )

    if _is_bse_code(stock_code):
        try:
            resolved = _resolve_org_id(stock_code)
            if resolved:
                code, org_id = resolved
                allowed_codes.add(code)
                stock_value = f"{code},{org_id}"

                def fetch_bse_page(page, _stock):
                    return _query_exchange_report(
                        page,
                        code,
                        normalized_type,
                        "bj",
                        "bj",
                        stock_value=stock_value,
                    )

                collected.extend(_paginate(fetch_bse_page, stock_value))
        except Exception as exc:
            logger.warning(
                "北交所%s查询失败: %s",
                REPORT_TYPE_SPECS[normalized_type]["label"],
                exc,
            )

    results = []
    seen_keys = set()
    for item in collected:
        title = item.get("announcementTitle", "")
        # Drop announcements that belong to other securities.
        if allowed_codes and str(item.get("secCode", "")) not in allowed_codes:
            continue
        key = (item.get("secCode"), title, item.get("adjunctUrl", ""))
        if key in seen_keys:
            continue
        seen_keys.add(key)

        if not _is_report_title(title, normalized_type, year_filter=year):
            continue
        # The title check ignores the year for a prospectus, so that type is
        # also checked against announcementTime; the other types were
        # already matched on the year in their title.
        if normalized_type == "prospectus" and not _matches_year(
            item, normalized_type, year
        ):
            continue
        results.append(item)

    return results
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def download_reports(stock_code, report_type="annual", year=None, save_path=None):
    """Query and download the PDFs of one report type for a stock.

    Args:
        stock_code: Stock code to look up.
        report_type: Report type or alias, resolved by
            ``normalize_report_type``.
        year: Optional year filter.
        save_path: Target directory; falls back to the module-level
            ``saving_path``.

    Returns:
        A dict with ``success``, ``message`` and ``downloaded``. ``path`` is
        included only when at least one announcement was found.
    """
    normalized_type = normalize_report_type(report_type)
    label = REPORT_TYPE_SPECS[normalized_type]["label"]
    found = query_reports(stock_code, normalized_type, year)

    if not found:
        not_found = f"未找到股票 {stock_code} 的{label}"
        if year:
            not_found += f"({year} 年)"
        return {"success": False, "message": not_found, "downloaded": 0}

    output_dir = save_path or saving_path
    count = Download(
        found,
        report_type=normalized_type,
        year_filter=year,
        save_path=output_dir,
    )

    downloaded = count or 0
    year_suffix = f"({year} 年)" if year else ""
    if downloaded > 0:
        message = f"已下载 {stock_code} {label}{year_suffix},共 {downloaded} 个文件"
    else:
        message = f"未下载任何文件({stock_code} {label}{year_suffix})"
    return {
        "success": downloaded > 0,
        "message": message,
        "downloaded": downloaded,
        "path": output_dir,
    }
|
|
505
586
|
|
|
506
587
|
|
|
588
|
+
def query_annual_reports(stock_code, year=None):
    """Query the annual reports of a stock via ``query_reports``."""
    return query_reports(stock_code, report_type="annual", year=year)
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def download_annual_reports(stock_code, year=None, save_path=None):
    """Download the annual reports of a stock via ``download_reports``."""
    return download_reports(
        stock_code, report_type="annual", year=year, save_path=save_path
    )
|
|
596
|
+
|
|
597
|
+
|
|
507
598
|
def Run(page_number, stock):
|
|
508
599
|
annual_report = []
|
|
509
600
|
stock_report = []
|
|
@@ -521,10 +612,10 @@ def Run(page_number, stock):
|
|
|
521
612
|
annual_report = szseAnnual(page_number, stock)
|
|
522
613
|
except Exception:
|
|
523
614
|
logger.warning("%s page error", page_number)
|
|
524
|
-
Download(annual_report)
|
|
525
|
-
Download(stock_report)
|
|
526
|
-
Download(annual_report_)
|
|
527
|
-
Download(stock_report_)
|
|
615
|
+
Download(annual_report, report_type="annual")
|
|
616
|
+
Download(stock_report, report_type="prospectus")
|
|
617
|
+
Download(annual_report_, report_type="annual")
|
|
618
|
+
Download(stock_report_, report_type="prospectus")
|
|
528
619
|
|
|
529
620
|
|
|
530
621
|
if __name__ == "__main__":
|
|
@@ -534,6 +625,6 @@ if __name__ == "__main__":
|
|
|
534
625
|
with open("company_id.txt") as file:
|
|
535
626
|
lines = file.readlines()
|
|
536
627
|
for line in lines:
|
|
537
|
-
stock = line
|
|
538
|
-
Run(1,
|
|
539
|
-
logger.info("%s done",
|
|
628
|
+
stock = line.strip()
|
|
629
|
+
Run(1, stock)
|
|
630
|
+
logger.info("%s done", stock)
|