@youhaozhao/cninfo-mcp 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/python/spider.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """
2
- 从巨潮资讯下载年度报告和招股书
2
+ 从巨潮资讯查询和下载上市公司定期报告、招股书
3
3
  """
4
4
 
5
5
  import datetime
@@ -32,6 +32,101 @@ QUERY_URL = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
32
32
  MAX_RETRIES = 3
33
33
  RETRY_BACKOFF = 1.0
34
34
 
35
+
36
# Accepted spellings for each canonical report type: English names,
# abbreviations that mirror the cninfo category codes (e.g. "ndbg"), and
# Chinese names. Grouped by canonical key so a new alias is added in one place.
_REPORT_TYPE_ALIAS_GROUPS = (
    ("annual", ("annual", "annual_report", "yearly", "ndbg", "年报", "年度报告")),
    (
        "semiannual",
        (
            "semiannual",
            "semi_annual",
            "half_year",
            "half-year",
            "bndbg",
            "半年度报告",
            "半年报",
            "中报",
        ),
    ),
    ("q1", ("q1", "first_quarter", "yjdbg", "一季报", "第一季度报告")),
    ("q3", ("q3", "third_quarter", "sjdbg", "三季报", "第三季度报告")),
    ("prospectus", ("prospectus", "ipo", "招股书", "招股说明书", "招股意向书")),
)

# Flat lookup table: alias -> canonical report type (a key of REPORT_TYPE_SPECS).
REPORT_TYPE_ALIASES = {
    alias: canonical
    for canonical, aliases in _REPORT_TYPE_ALIAS_GROUPS
    for alias in aliases
}
67
+
68
+
69
def _title_templates(*bodies: str) -> list:
    """Build title regex templates of the form ``.*{year}<body>{suffix}``.

    The ``{year}`` and ``{suffix}`` placeholders are left in place; they are
    filled in later with ``str.format`` when a title is matched.
    """
    return [".*{year}" + body + "{suffix}" for body in bodies]


# Per-report-type configuration, keyed by canonical report type.
#   label    - human-readable (Chinese) name used in messages
#   category - value sent as the "category" field of the announcement query
#   patterns - title regex templates (periodic reports only)
#   keywords - title substrings identifying the document (prospectus only)
REPORT_TYPE_SPECS = {
    "annual": {
        "label": "年度报告",
        "category": "category_ndbg_szsh",
        "patterns": _title_templates("年年度报告", "年度报告", "年报"),
    },
    "semiannual": {
        "label": "半年度报告",
        "category": "category_bndbg_szsh",
        "patterns": _title_templates("年半年度报告", "半年度报告", "年中期报告"),
    },
    "q1": {
        "label": "第一季度报告",
        "category": "category_yjdbg_szsh",
        "patterns": _title_templates(
            "年第一季度报告", "第一季度报告", "年一季度报告", "一季度报告"
        ),
    },
    "q3": {
        "label": "第三季度报告",
        "category": "category_sjdbg_szsh",
        "patterns": _title_templates(
            "年第三季度报告", "第三季度报告", "年三季度报告", "三季度报告"
        ),
    },
    # The prospectus has no category filter and is recognised by keyword
    # instead of by a year-based title pattern.
    "prospectus": {
        "label": "招股书",
        "category": "",
        "keywords": ["招股书", "招股说明书", "招股意向书"],
    },
}
114
+
115
+
116
# Title substrings that mark an announcement as something other than the main
# body of a report (summary, correction, notice, English edition, ...).
# A title containing any of these is rejected by the title matcher.
COMMON_EXCLUDE_KEYWORDS = [
    "摘要",  # summary / abstract
    "确认意见",  # confirmation opinion
    "取消",  # cancellation
    "更正",  # correction
    "补充",  # supplement
    "说明",  # explanatory note
    "提示",  # notice / reminder
    "致歉",  # apology
    "修订",  # revision
    "英文",  # English edition
]
128
+
129
+
35
130
  User_Agent = [
36
131
  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
37
132
  "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
@@ -55,6 +150,23 @@ BASE_HEADERS = {
55
150
  }
56
151
 
57
152
 
153
def supported_report_types() -> dict:
    """Return the supported report types as ``{canonical_key: label}``.

    Keys and labels are taken from ``REPORT_TYPE_SPECS`` in its definition
    order.
    """
    labels = {}
    for type_key, spec in REPORT_TYPE_SPECS.items():
        labels[type_key] = spec["label"]
    return labels
156
+
157
+
158
def normalize_report_type(report_type: Optional[str]) -> str:
    """Normalize an English/Chinese alias to the canonical report type.

    The input is stripped and lower-cased, then looked up in
    ``REPORT_TYPE_ALIASES`` with spaces replaced by underscores. If that
    exact lookup misses, a second lookup is tried with every run of
    whitespace and/or hyphens collapsed to a single underscore, so spellings
    such as ``"semi-annual"`` or ``"first  quarter"`` resolve as well.

    Args:
        report_type: Alias to normalize. ``None`` or any other falsy value
            defaults to ``"annual"``.

    Returns:
        The canonical report type (a value of ``REPORT_TYPE_ALIASES``).

    Raises:
        ValueError: If the alias is not recognised.
    """
    raw = str(report_type or "annual").strip().lower()

    # Exact lookup first: keeps aliases that legitimately contain a hyphen
    # (e.g. "half-year") working exactly as before.
    normalized = REPORT_TYPE_ALIASES.get(raw.replace(" ", "_"))
    if normalized is None:
        # Fallback: treat hyphens and repeated whitespace as word separators.
        normalized = REPORT_TYPE_ALIASES.get(re.sub(r"[\s\-]+", "_", raw))

    if normalized is None:
        supported = ", ".join(supported_report_types().keys())
        raise ValueError(
            f"Unsupported report_type '{report_type}'. Supported: {supported}"
        )
    return normalized
168
+
169
+
58
170
  def _build_headers() -> dict:
59
171
  """构造请求头,避免在并发场景下修改全局字典。"""
60
172
  headers = BASE_HEADERS.copy()
@@ -74,9 +186,7 @@ def _post_json(url: str, data: dict) -> dict:
74
186
  last_exc = None
75
187
  for attempt in range(MAX_RETRIES):
76
188
  try:
77
- resp = requests.post(
78
- url, headers=_build_headers(), data=data, timeout=30
79
- )
189
+ resp = requests.post(url, headers=_build_headers(), data=data, timeout=30)
80
190
  resp.raise_for_status()
81
191
  return resp.json()
82
192
  except requests.exceptions.HTTPError as e:
@@ -161,349 +271,330 @@ def _paginate(fetch_fn, stock):
161
271
  if not items:
162
272
  break
163
273
  all_items.extend(items)
164
- if len(items) < PAGE_SIZE: # 不足一页说明已到最后一页
274
+ if len(items) < PAGE_SIZE:
165
275
  break
166
276
  else:
167
277
  logger.warning("翻页达到上限 %s,结果可能被截断(%s)", MAX_PAGES, stock)
168
278
  return all_items
169
279
 
170
280
 
171
- def _is_annual_report_title(
172
- title: str, year_filter: Optional[Union[int, str]] = None
173
- ) -> bool:
174
- """
175
- 判断标题是否为“年度报告正文”。
281
+ def _compact_title(title: str) -> str:
282
+ return re.sub(r"\s+", "", title or "")
176
283
 
177
- 支持常见变体:
178
- - 2024年年度报告
179
- - 2024年度报告
180
- - 2024年报
181
- """
182
- compact_title = re.sub(r"\s+", "", title or "")
183
-
184
- # 非正文公告关键词过滤
185
- exclude_keywords = [
186
- "摘要",
187
- "确认意见",
188
- "取消",
189
- "更正",
190
- "补充",
191
- "说明",
192
- "提示",
193
- "致歉",
194
- "修订",
195
- "英文",
196
- ]
197
- if any(keyword in compact_title for keyword in exclude_keywords):
284
+
285
def _is_report_title(
    title: str,
    report_type: str,
    year_filter: Optional[Union[int, str]] = None,
) -> bool:
    """Decide whether *title* is the main body of a report of the given type.

    Args:
        title: Announcement title; whitespace is ignored.
        report_type: Report type or alias, resolved via
            ``normalize_report_type`` (which raises ``ValueError`` for an
            unknown alias).
        year_filter: Year that must appear in the title of a periodic report.
            ``None`` accepts any four-digit year. Not used for prospectuses.

    Returns:
        ``True`` if the title matches, ``False`` otherwise.
    """
    compact = _compact_title(title)
    canonical = normalize_report_type(report_type)
    spec = REPORT_TYPE_SPECS[canonical]

    if canonical == "prospectus":
        for keyword in spec["keywords"]:
            if keyword not in compact:
                continue
            # Remove the document name before checking for summary/correction
            # variants: "招股说明书" itself contains the excluded word "说明".
            leftover = compact.replace(keyword, "")
            return not any(word in leftover for word in COMMON_EXCLUDE_KEYWORDS)
        return False

    # Summaries, corrections, revisions etc. are not the report body.
    for word in COMMON_EXCLUDE_KEYWORDS:
        if word in compact:
            return False

    year_expr = r"\d{4}" if year_filter is None else re.escape(str(year_filter))
    # Optional "(updated)" marker, with ASCII or full-width parentheses.
    suffix_expr = r"(?:[((]更新后[))])?"
    for template in spec["patterns"]:
        regex = template.format(year=year_expr, suffix=suffix_expr)
        if re.fullmatch(regex, compact):
            return True
    return False
210
315
 
211
316
 
212
- # 深市 年度报告
213
- def szseAnnual(page, stock):
214
- query = {
215
- "pageNum": page, # 页码
317
def _is_annual_report_title(
    title: str, year_filter: Optional[Union[int, str]] = None
) -> bool:
    """Backward-compatible shim: is *title* the main body of an annual report?

    Delegates to ``_is_report_title`` with the report type fixed to
    ``"annual"``.
    """
    return _is_report_title(title=title, report_type="annual", year_filter=year_filter)
322
+
323
+
324
+ def _matches_year(
325
+ announcement: dict, report_type: str, year: Optional[Union[int, str]]
326
+ ):
327
+ if year is None:
328
+ return True
329
+ normalized_type = normalize_report_type(report_type)
330
+ if normalized_type == "prospectus":
331
+ announcement_time = announcement.get("announcementTime", "")
332
+ # announcementTime 通常是 "YYYY-MM-DD" 字符串;个别接口可能返回 epoch 毫秒
333
+ if isinstance(announcement_time, (int, float)):
334
+ announcement_time = datetime.datetime.fromtimestamp(
335
+ announcement_time / 1000
336
+ ).strftime("%Y-%m-%d")
337
+ return str(announcement_time).startswith(str(year))
338
+ return _is_report_title(
339
+ announcement.get("announcementTitle", ""), normalized_type, year_filter=year
340
+ )
341
+
342
+
343
def _build_report_query(
    page: int,
    stock_code: str,
    report_type: str,
    column: str,
    plate: str,
    stock_value: str = "",
) -> dict:
    """Assemble the form payload for one page of an announcement query.

    Args:
        page: Page number to request.
        stock_code: Stock code, used as the search key when *stock_value* is
            empty.
        report_type: Report type or alias; selects the query category.
        column: Value for the ``column`` field (e.g. ``"szse"``, ``"sse"``, ``"bj"``).
        plate: Value for the ``plate`` field (e.g. ``"sz"``, ``"sh"``, ``"bj"``).
        stock_value: Value for the ``stock`` field. When given, the stock code
            is left out of the search key.

    Returns:
        The query dictionary.
    """
    canonical = normalize_report_type(report_type)
    category = REPORT_TYPE_SPECS[canonical]["category"]

    # Prospectuses have no category, so they are found by searching "招股";
    # the stock code is added only when it is not already sent via `stock`.
    if stock_value:
        searchkey = "招股" if canonical == "prospectus" else ""
    elif canonical == "prospectus":
        searchkey = f"{stock_code} 招股"
    else:
        searchkey = stock_code

    query = {
        "pageNum": page,
        "pageSize": PAGE_SIZE,
        "tabName": "fulltext",
        "column": column,
        "stock": stock_value,
        "searchkey": searchkey,
        "secid": "",
        "plate": plate,
        "category": category,
        "trade": "",
        "seDate": _date_range(EARLIEST_DATE),
    }
    return query
227
372
 
373
+
374
def _query_exchange_report(
    page: int,
    stock_code: str,
    report_type: str,
    column: str,
    plate: str,
    stock_value: str = "",
) -> list:
    """Query one page of announcements for a report type on one exchange.

    Builds the payload with ``_build_report_query`` and passes it to
    ``_query_announcements``; the arguments have the same meaning as there.
    """
    payload = _build_report_query(
        page, stock_code, report_type, column, plate, stock_value
    )
    return _query_announcements(payload)
229
391
 
230
392
 
393
+ def _sanitize_filename(name: str) -> str:
394
+ return re.sub(r'[\\/:*?"<>|]', "", name).strip()
395
+
396
+
397
# Shenzhen market: annual reports
def szseAnnual(page, stock):
    """Query one page of annual-report announcements with column "szse", plate "sz"."""
    return _query_exchange_report(
        page=page, stock_code=stock, report_type="annual", column="szse", plate="sz"
    )
400
+
401
+
231
402
# Shanghai market: annual reports
def sseAnnual(page, stock):
    """Query one page of annual-report announcements with column "sse", plate "sh"."""
    return _query_exchange_report(
        page=page, stock_code=stock, report_type="annual", column="sse", plate="sh"
    )
248
405
 
249
406
 
250
407
# Beijing Stock Exchange: annual reports
def bseAnnual(page, stock):
    """Query one page of Beijing Stock Exchange annual-report announcements.

    *stock* has the form ``"code,orgId"``. The whole value is sent as the
    query's ``stock`` field; the part before the first comma is passed along
    as the plain stock code.
    """
    code, _, _ = str(stock).partition(",")
    return _query_exchange_report(page, code, "annual", "bj", "bj", stock_value=stock)
272
412
 
273
413
 
274
414
# Shenzhen market: prospectuses
def szseStock(page, stock):
    """Query one page of prospectus announcements with column "szse", plate "sz"."""
    return _query_exchange_report(
        page=page, stock_code=stock, report_type="prospectus", column="szse", plate="sz"
    )
291
417
 
292
418
 
293
419
# Shanghai market: prospectuses
def sseStock(page, stock):
    """Query one page of prospectus announcements with column "sse", plate "sh"."""
    return _query_exchange_report(
        page=page, stock_code=stock, report_type="prospectus", column="sse", plate="sh"
    )
310
422
 
311
423
 
312
424
def Download(
    single_page,
    report_type: Optional[str] = None,
    year_filter: Optional[Union[int, str]] = None,
    save_path: Optional[str] = None,
):
    """Download the PDF of every matching announcement in *single_page*.

    Args:
        single_page: Iterable of announcement dicts, or ``None``.
        report_type: Report type or alias to keep. When omitted, an
            announcement is kept if its title matches any supported type.
        year_filter: Year passed to the title matcher; ``None`` accepts any year.
        save_path: Output directory; defaults to the module-level
            ``saving_path``. Created on demand.

    Returns:
        The number of files written (``0`` when *single_page* is ``None``).

    Raises:
        ValueError: If *report_type* is not a recognised alias.
        Exception: Network/HTTP errors from ``requests`` and file-system
            errors are not caught here; files written before the failure are kept.
    """
    if single_page is None:
        return 0

    output_dir = (save_path or saving_path).rstrip("/\\") + "/"
    downloaded_count = 0
    normalized_type = normalize_report_type(report_type) if report_type else None
    # With no explicit type, accept a title that matches any supported type.
    candidate_types = [normalized_type] if normalized_type else list(REPORT_TYPE_SPECS)

    for item in single_page:
        # `.get(key, "")` still yields None when the key is present with a
        # null value, so coerce explicitly before any string handling.
        title = item.get("announcementTitle") or ""
        if not any(
            _is_report_title(title, candidate, year_filter=year_filter)
            for candidate in candidate_types
        ):
            continue

        adjunct_url = item.get("adjunctUrl") or ""
        if not adjunct_url:
            logger.warning("公告缺少 adjunctUrl,跳过:%s", title)
            continue

        download = download_path + str(adjunct_url)
        sec_code = str(item.get("secCode") or "")
        sec_name = str(item.get("secName") or "")
        name = _sanitize_filename(sec_code + "_" + sec_name + "_" + title + ".pdf")
        file_path = output_dir + name

        logger.info("↓ %s", name)
        os.makedirs(output_dir, exist_ok=True)

        # Random pause (up to 2 s) between downloads.
        time.sleep(random.random() * 2)

        r = requests.get(
            download, headers={"User-Agent": random.choice(User_Agent)}, timeout=30
        )
        r.raise_for_status()
        with open(file_path, "wb") as f:
            f.write(r.content)
        downloaded_count += 1

    return downloaded_count
376
478
 
377
479
 
378
- def query_prospectus(stock_code):
379
- """查询指定股票代码的招股书公告列表"""
480
def query_reports(stock_code, report_type="annual", year=None):
    """Return the announcements of one report type for one stock.

    Queries the Shanghai and Shenzhen markets, plus the Beijing Stock
    Exchange when ``_is_bse_code(stock_code)`` is true, then filters the
    combined results. A failure on one exchange is logged as a warning and
    does not stop the others.

    Args:
        stock_code: Stock code to search for.
        report_type: Report type or alias (default ``"annual"``).
        year: Optional year filter. Periodic reports are matched on the year
            in the title; prospectuses on the announcement time.

    Returns:
        De-duplicated list of announcement dicts whose title matches the
        report type (and year, if given).

    Raises:
        ValueError: If *report_type* is not a recognised alias.
    """
    normalized_type = normalize_report_type(report_type)
    type_label = REPORT_TYPE_SPECS[normalized_type]["label"]
    collected = []

    # Only announcements whose secCode equals the requested code are kept;
    # if the input contains no digits at all, no code filtering is applied.
    digits_only = re.sub(r"\D", "", str(stock_code or ""))
    allowed_sec_codes = {digits_only} if digits_only else set()

    def _page_fetcher(column, plate, code, stock_value=""):
        """Return a ``(page, stock)`` callable for ``_paginate`` bound to one exchange."""

        def fetch_page(page, _stock):
            return _query_exchange_report(
                page, code, normalized_type, column, plate, stock_value=stock_value
            )

        return fetch_page

    for column, plate, market in (("sse", "sh", "沪市"), ("szse", "sz", "深市")):
        try:
            fetcher = _page_fetcher(column, plate, stock_code)
            collected.extend(_paginate(fetcher, stock_code))
        except Exception as e:
            logger.warning("%s%s查询失败: %s", market, type_label, e)

    if _is_bse_code(stock_code):
        try:
            resolved = _resolve_org_id(stock_code)
            if resolved:
                code, org_id = resolved
                allowed_sec_codes.add(code)
                # Beijing Stock Exchange queries send "code,orgId" as `stock`.
                stock_value = f"{code},{org_id}"
                fetcher = _page_fetcher("bj", "bj", code, stock_value)
                collected.extend(_paginate(fetcher, stock_value))
        except Exception as e:
            logger.warning("北交所%s查询失败: %s", type_label, e)

    filtered = []
    seen = set()
    for announcement in collected:
        raw_sec_code = announcement.get("secCode", "")
        if allowed_sec_codes and str(raw_sec_code) not in allowed_sec_codes:
            continue

        title = announcement.get("announcementTitle", "")
        dedupe_key = (
            announcement.get("secCode"),
            title,
            announcement.get("adjunctUrl", ""),
        )
        if dedupe_key in seen:
            continue
        seen.add(dedupe_key)

        keep = _is_report_title(title, normalized_type, year_filter=year)
        # Prospectus titles carry no year, so the year is checked separately
        # against the announcement time; other types were already matched on
        # the year inside the title.
        if keep and normalized_type == "prospectus":
            keep = _matches_year(announcement, normalized_type, year)
        if keep:
            filtered.append(announcement)

    return filtered
552
+
553
+
554
def download_reports(stock_code, report_type="annual", year=None, save_path=None):
    """Download the PDFs of one report type for one stock.

    Args:
        stock_code: Stock code to search for.
        report_type: Report type or alias (default ``"annual"``).
        year: Optional year filter.
        save_path: Output directory; defaults to the module-level ``saving_path``.

    Returns:
        A dict with ``success`` (bool), ``message`` (str), ``downloaded``
        (int) and ``path`` (output directory). ``path`` is included even when
        nothing was found, so every result has the same shape.

    Raises:
        ValueError: If *report_type* is not a recognised alias.
    """
    normalized_type = normalize_report_type(report_type)
    label = REPORT_TYPE_SPECS[normalized_type]["label"]
    output_dir = save_path or saving_path
    year_suffix = f"({year} 年)" if year else ""

    announcements = query_reports(stock_code, normalized_type, year)
    if not announcements:
        return {
            "success": False,
            "message": f"未找到股票 {stock_code} 的{label}" + year_suffix,
            "downloaded": 0,
            "path": output_dir,
        }

    count = Download(
        announcements,
        report_type=normalized_type,
        year_filter=year,
        save_path=output_dir,
    )

    downloaded = count or 0
    if downloaded > 0:
        message = f"已下载 {stock_code} {label}{year_suffix},共 {downloaded} 个文件"
    else:
        message = f"未下载任何文件({stock_code} {label}{year_suffix})"
    return {
        "success": downloaded > 0,
        "message": message,
        "downloaded": downloaded,
        "path": output_dir,
    }
505
586
 
506
587
 
588
def query_annual_reports(stock_code, year=None):
    """Return the annual-report announcements for *stock_code*.

    Thin wrapper around ``query_reports`` with the report type fixed to
    ``"annual"``.
    """
    return query_reports(stock_code, report_type="annual", year=year)
591
+
592
+
593
def download_annual_reports(stock_code, year=None, save_path=None):
    """Download the annual reports for *stock_code*.

    Thin wrapper around ``download_reports`` with the report type fixed to
    ``"annual"``; returns its result dict unchanged.
    """
    return download_reports(
        stock_code, report_type="annual", year=year, save_path=save_path
    )
596
+
597
+
507
598
  def Run(page_number, stock):
508
599
  annual_report = []
509
600
  stock_report = []
@@ -521,10 +612,10 @@ def Run(page_number, stock):
521
612
  annual_report = szseAnnual(page_number, stock)
522
613
  except Exception:
523
614
  logger.warning("%s page error", page_number)
524
- Download(annual_report)
525
- Download(stock_report)
526
- Download(annual_report_)
527
- Download(stock_report_)
615
+ Download(annual_report, report_type="annual")
616
+ Download(stock_report, report_type="prospectus")
617
+ Download(annual_report_, report_type="annual")
618
+ Download(stock_report_, report_type="prospectus")
528
619
 
529
620
 
530
621
  if __name__ == "__main__":
@@ -534,6 +625,6 @@ if __name__ == "__main__":
534
625
  with open("company_id.txt") as file:
535
626
  lines = file.readlines()
536
627
  for line in lines:
537
- stock = line
538
- Run(1, line)
539
- logger.info("%s done", line.strip())
628
+ stock = line.strip()
629
+ Run(1, stock)
630
+ logger.info("%s done", stock)