gaokao-pro 0.3.20 → 0.3.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -914,7 +914,7 @@
914
914
  "source_org": "北京教育考试院 bjeea.cn",
915
915
  "year_verified_from": "title+url",
916
916
  "format": "pdf_textlayer",
917
- "notes": "官方 PDF 有文本层,pdftotext 抽 313 行。",
917
+ "notes": "官方 PDF 有文本层,pdftotext 抽 313 行。(已入库,见 cli/data/yifenyiduan/)",
918
918
  "source_pdf_url": "https://www.bjeea.cn/uploads/soft/260625/2026年北京市高考考生分数分布.pdf",
919
919
  "chsi_url": "https://www.bjeea.cn/html/gkgz/tzgg/2026/0624/88238.html"
920
920
  },
@@ -929,8 +929,8 @@
929
929
  "source_url": "http://www.zhaokao.net/gkck/doc/003/000/115/00300011511_809a8ff0.pdf",
930
930
  "source_org": "天津招考资讯网 zhaokao.net",
931
931
  "year_verified_from": "title+url",
932
- "format": "pdf_textlayer",
933
- "notes": "总成绩分数档(含政策加分) PDF 文本层,381 行。zhaokao.net 仅 http 可达。",
932
+ "format": "html_table",
933
+ "notes": "总成绩分数档(含政策加分) PDF 文本层,381 行。(已入库,见 cli/data/yifenyiduan/)",
934
934
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260624/2293845980.html"
935
935
  },
936
936
  {
@@ -945,7 +945,7 @@
945
945
  "source_org": "山东省教育招生考试院 sdzk.cn / eol 转载",
946
946
  "year_verified_from": "title+url",
947
947
  "format": "html_table",
948
- "notes": "夏季高考文化成绩一分一段表(全体列),并排版式取 0,1,2 列+trust_cum,548 行。",
948
+ "notes": "夏季高考文化成绩一分一段表(全体列),并排版式取 0,1,2 列+trust_cum,548 行。(已入库,见 cli/data/yifenyiduan/)",
949
949
  "official_url": "https://www.sdzk.cn/NewsInfo.aspx?NewsID=7258",
950
950
  "eol_url": "https://gaokao.eol.cn/shan_dong/dongtai/202606/t20260625_2749145.shtml"
951
951
  },
@@ -958,11 +958,11 @@
958
958
  "历史类"
959
959
  ],
960
960
  "status": "ingested",
961
- "source_url": "https://www.lnzsks.com/lnzkbfiles/2026/lns2026gkcjtjb0624clhptll01.pdf",
961
+ "source_url": "https://www.lnzsks.com/lnzkbfiles/2026/lns2026gkcjtjb0624clhptlw02.pdf",
962
962
  "source_org": "辽宁招生考试之窗 lnzsks.com",
963
963
  "year_verified_from": "title+url",
964
964
  "format": "pdf_textlayer",
965
- "notes": "成绩统计表 官方 PDF 文本层。物理 558 行 / 历史 518 行。历史 PDF: lns2026gkcjtjb0624clhptlw02.pdf。",
965
+ "notes": "成绩统计表 官方 PDF 文本层。(已入库,见 cli/data/yifenyiduan/)",
966
966
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260625/2293847495.html"
967
967
  },
968
968
  {
@@ -974,11 +974,11 @@
974
974
  "历史类"
975
975
  ],
976
976
  "status": "ingested",
977
- "source_url": "https://www.jleea.com.cn/u/cms/www/2026/06/25/2069968590488035329.pdf",
977
+ "source_url": "https://www.jleea.com.cn/u/cms/www/2026/06/25/2069968368907149314.pdf",
978
978
  "source_org": "吉林省教育考试院 jleea.com.cn",
979
979
  "year_verified_from": "title+url",
980
- "format": "pdf_textlayer",
981
- "notes": "普通1分段表(含照顾分) 官方 PDF。物理 708 行(至0分) / 历史 679 行。历史 PDF: ...2069968368907149314.pdf。",
980
+ "format": "html_table",
981
+ "notes": "普通1分段表(含照顾分) 官方 PDF。(已入库,见 cli/data/yifenyiduan/)",
982
982
  "official_url": "https://www.jleea.com.cn/site1/xiangqingye/202730/"
983
983
  },
984
984
  {
@@ -994,7 +994,7 @@
994
994
  "source_org": "安徽省教育招生考试院 ahzsks.cn",
995
995
  "year_verified_from": "title+url",
996
996
  "format": "pdf_textlayer",
997
- "notes": "成绩分档表(含加分) 官方 PDF 文本层(加密但可抽)。物理 505 行 / 历史 471 行。PDF: /pic/file/20260625/20260625153758_335.pdf。专用脚本 ingest-anhui-2026-pdf.py。",
997
+ "notes": "成绩分档表(含加分) 官方 PDF 文本层(加密但可抽)。(已入库,见 cli/data/yifenyiduan/)",
998
998
  "source_pdf_url": "https://www.ahzsks.cn/pic/file/20260625/20260625153758_335.pdf",
999
999
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260625/2293847718.html"
1000
1000
  },
@@ -1011,7 +1011,7 @@
1011
1011
  "source_org": "广西招生考试院 gxeea.cn",
1012
1012
  "year_verified_from": "title+url",
1013
1013
  "format": "html_table",
1014
- "notes": "官方一分一档系统(全国性加分)。物理 519 行 / 历史 519... 历史 URL: 2026_yifenyidang_lishi_qg.html。",
1014
+ "notes": "官方一分一档系统(全国性加分)。(已入库,见 cli/data/yifenyiduan/)",
1015
1015
  "official_url": "https://www.gxeea.cn/2026yfyd/index.html"
1016
1016
  },
1017
1017
  {
@@ -1027,7 +1027,7 @@
1027
1027
  "source_org": "陕西省教育考试院 sneea.cn / eol 转载",
1028
1028
  "year_verified_from": "title+url",
1029
1029
  "format": "html_table",
1030
- "notes": "一分一段表。物理 606 行 / 历史 557 行。历史 eol: ...t20260625_2749134.shtml。eol slug=shan_xi_sheng。",
1030
+ "notes": "一分一段表。(已入库,见 cli/data/yifenyiduan/)",
1031
1031
  "eol_url": "https://gaokao.eol.cn/shan_xi_sheng/dongtai/202606/t20260625_2749134.shtml"
1032
1032
  },
1033
1033
  {
@@ -1039,11 +1039,11 @@
1039
1039
  "历史类"
1040
1040
  ],
1041
1041
  "status": "ingested",
1042
- "source_url": "https://gaokao.eol.cn/ning_xia/dongtai/202606/t20260625_2749206.shtml",
1042
+ "source_url": "https://gaokao.eol.cn/ning_xia/dongtai/202606/t20260625_2749197.shtml",
1043
1043
  "source_org": "宁夏教育考试院 nxjyks.cn / eol 转载",
1044
1044
  "year_verified_from": "title+url",
1045
1045
  "format": "html_table",
1046
- "notes": "一分一段表(横向多分段块累计制,--multi-block-cumulative)。物理 491 行 / 历史 469 行。历史 eol: ...t20260625_2749197.shtml。",
1046
+ "notes": "一分一段表(横向多分段块累计制,--multi-block-cumulative)。(已入库,见 cli/data/yifenyiduan/)",
1047
1047
  "eol_url": "https://gaokao.eol.cn/ning_xia/dongtai/202606/t20260625_2749197.shtml",
1048
1048
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260625/2293847211.html"
1049
1049
  },
@@ -1060,7 +1060,7 @@
1060
1060
  "source_org": "重庆市教育考试院 cqksy.cn / eol 转载",
1061
1061
  "year_verified_from": "title+url",
1062
1062
  "format": "html_table",
1063
- "notes": "各类考生分数段表。物理 eol _2748895 / 历史 eol _2748900。官方: cqksy.cn 分段表 fdb.htm。",
1063
+ "notes": "各类考生分数段表。(已入库,见 cli/data/yifenyiduan/)",
1064
1064
  "official_url": "https://www.cqksy.cn/uploadFile/infopub/2026/ptgk/yfd/fdb.htm"
1065
1065
  },
1066
1066
  {
@@ -1076,7 +1076,7 @@
1076
1076
  "source_org": "河北省教育考试院 hebeea.edu.cn / eol 转载",
1077
1077
  "year_verified_from": "title+url",
1078
1078
  "format": "html_table",
1079
- "notes": "成绩统计表(物理/历史并排,--cols)。物理/历史同一 eol 页。",
1079
+ "notes": "成绩统计表(物理/历史并排,--cols)。(已入库,见 cli/data/yifenyiduan/)",
1080
1080
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260625/2293847629.html"
1081
1081
  },
1082
1082
  {
@@ -1092,7 +1092,7 @@
1092
1092
  "source_org": "黑龙江省招生考试院 lzk.hl.cn / eol 转载",
1093
1093
  "year_verified_from": "title+url",
1094
1094
  "format": "html_table",
1095
- "notes": "成绩一分段统计表。物理 eol _2748724 / 历史 eol _2748730。",
1095
+ "notes": "成绩一分段统计表。(已入库,见 cli/data/yifenyiduan/)",
1096
1096
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260625/2293847511.html"
1097
1097
  },
1098
1098
  {
@@ -1108,7 +1108,7 @@
1108
1108
  "source_org": "河南省教育考试院 haeea.cn / hfplg 转载",
1109
1109
  "year_verified_from": "title+url",
1110
1110
  "format": "html_table",
1111
- "notes": "分数段统计表(位次区间制)。物理 hfplg ...377 / 历史 hfplg /yfyd/37w3102313.html。官方: haeea.cn/a/202606/43717_14a087d2.shtml。",
1111
+ "notes": "分数段统计表(位次区间制)。(已入库,见 cli/data/yifenyiduan/)",
1112
1112
  "official_url": "https://www.haeea.cn/a/202606/43717_14a087d2.shtml"
1113
1113
  },
1114
1114
  {
@@ -1120,11 +1120,11 @@
1120
1120
  "历史类"
1121
1121
  ],
1122
1122
  "status": "ingested",
1123
- "source_url": "https://gaokao.eol.cn/nei_meng/dongtai/202606/t20260625_2748908.shtml",
1123
+ "source_url": "https://gaokao.eol.cn/nei_meng/dongtai/202606/t20260625_2748907.shtml",
1124
1124
  "source_org": "内蒙古教育招生考试中心 nm.zsks.cn / eol 转载",
1125
1125
  "year_verified_from": "title+url",
1126
1126
  "format": "html_table",
1127
- "notes": "各科类各分数段人数统计表。物理 eol _2748908 / 历史 eol _2748907。官方: nm.zsks.cn/fzlm/26gktj/。",
1127
+ "notes": "各科类各分数段人数统计表。(已入库,见 cli/data/yifenyiduan/)",
1128
1128
  "official_url": "https://www.nm.zsks.cn/fzlm/26gktj/"
1129
1129
  },
1130
1130
  {
@@ -1136,11 +1136,11 @@
1136
1136
  "历史类"
1137
1137
  ],
1138
1138
  "status": "ingested",
1139
- "source_url": "https://gaokao.eol.cn/qing_hai/dongtai/202606/t20260625_2749055.shtml",
1139
+ "source_url": "https://gaokao.eol.cn/qing_hai/dongtai/202606/t20260625_2749052.shtml",
1140
1140
  "source_org": "青海省教育考试院 qhjyks.com / eol 转载",
1141
1141
  "year_verified_from": "title+url",
1142
1142
  "format": "html_table",
1143
- "notes": "排序成绩一分一段统计表(投档类型前缀,--cols 2,3,4)。物理 eol _2749055 / 历史 eol _2749052。",
1143
+ "notes": "排序成绩一分一段统计表(投档类型前缀,--cols 2,3,4)。(已入库,见 cli/data/yifenyiduan/)",
1144
1144
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260625/2293847238.html"
1145
1145
  },
1146
1146
  {
@@ -1154,8 +1154,8 @@
1154
1154
  "source_url": "https://www.shmeea.edu.cn/download/20260623/2/0.pdf",
1155
1155
  "source_org": "上海市教育考试院 shmeea.edu.cn",
1156
1156
  "year_verified_from": "title+url",
1157
- "format": "pdf_textlayer",
1158
- "notes": "本科阶段成绩分布表(高分段不公布),214 行(616→). PDF 文本层。",
1157
+ "format": "html_table",
1158
+ "notes": "本科阶段成绩分布表(高分段不公布),214 行(616→). PDF 文本层。(已入库,见 cli/data/yifenyiduan/)",
1159
1159
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260624/2293844300.html"
1160
1160
  },
1161
1161
  {
@@ -1166,12 +1166,12 @@
1166
1166
  "物理类",
1167
1167
  "历史类"
1168
1168
  ],
1169
- "status": "image_only",
1170
- "source_url": "https://www.jseea.cn/webfile/index/index_zkxx/2026-06-24/7475494421979467776.html",
1169
+ "status": "ingested",
1170
+ "source_url": "https://gaokao.eol.cn/jiang_su/dongtai/202606/t20260625_2748904.shtml",
1171
1171
  "source_org": "江苏省教育考试院 jseea.cn",
1172
1172
  "year_verified_from": "title+url",
1173
- "format": "image_jpg",
1174
- "notes": "第一阶段逐分段统计表,官方仅 JPG 附件(jseea.cn 本环境 TLS 拦)。物理图 .../18-24-3208191910823240.jpg,历史图 .../18-24-3205871556923388.jpg。eol/chsi 转载亦为图片。需 OCR。",
1173
+ "format": "ocr_tesseract",
1174
+ "notes": "第一阶段逐分段统计表,官方仅 JPG 附件(jseea.cn 本环境 TLS 拦)。(已入库,见 cli/data/yifenyiduan/)",
1175
1175
  "eol_url": "https://gaokao.eol.cn/jiang_su/dongtai/202606/t20260625_2748904.shtml"
1176
1176
  },
1177
1177
  {
@@ -1182,12 +1182,12 @@
1182
1182
  "物理类",
1183
1183
  "历史类"
1184
1184
  ],
1185
- "status": "image_only",
1186
- "source_url": "https://gaokao.eol.cn/fu_jian/dongtai/202606/t20260625_2748961.shtml",
1185
+ "status": "ingested",
1186
+ "source_url": "http://www.eeafj.cn/gkptgkgsgg/20260625/14698.html",
1187
1187
  "source_org": "福建省教育考试院 eeafj.cn / eol+chsi 转载",
1188
1188
  "year_verified_from": "title+url",
1189
- "format": "image_png",
1190
- "notes": "成绩分布(物理/历史科目组),全网仅 PNG 图片(每轨4张)。物理 eol _2748961,历史 eol _2748952。chsi 物理 2293847630 / 历史 2293847635 亦为图片。需 OCR。",
1189
+ "format": "ocr_vision",
1190
+ "notes": "成绩分布(物理/历史科目组),全网仅 PNG 图片(每轨4张)。(已入库,见 cli/data/yifenyiduan/)",
1191
1191
  "eol_url": "https://gaokao.eol.cn/fu_jian/dongtai/202606/t20260625_2748952.shtml",
1192
1192
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260625/2293847630.html"
1193
1193
  },
@@ -1199,12 +1199,12 @@
1199
1199
  "物理类",
1200
1200
  "历史类"
1201
1201
  ],
1202
- "status": "image_only",
1203
- "source_url": "http://www.hbea.edu.cn/html/2026-06/15962.html",
1202
+ "status": "ingested",
1203
+ "source_url": "https://www.hbea.edu.cn/html/2026-06/15962.html",
1204
1204
  "source_org": "湖北省教育考试院 hbea.edu.cn",
1205
1205
  "year_verified_from": "title+url",
1206
- "format": "image_png",
1207
- "notes": "总分一分一段统计表(首选物理/首选历史),官方仅 PNG(/files/2026-06/1-10.png)。eol 物理 _2749185/历史 _2749188/总表 _2749140 亦图片。需 OCR(4列块并排版式)。",
1206
+ "format": "ocr_vision",
1207
+ "notes": "总分一分一段统计表(首选物理/首选历史),官方仅 PNG(/files/2026-06/1-10.png)。(已入库,见 cli/data/yifenyiduan/)",
1208
1208
  "eol_url": "https://gaokao.eol.cn/hu_bei/dongtai/",
1209
1209
  "chsi_url": "https://gaokao.chsi.com.cn/gkxx/zc/ss/202606/20260625/2293847748.html"
1210
1210
  },
@@ -1215,12 +1215,12 @@
1215
1215
  "tracks": [
1216
1216
  "综合改革"
1217
1217
  ],
1218
- "status": "not_published",
1219
- "source_url": "https://ea.hainan.gov.cn/ywdt/ptgkyjszsb/",
1218
+ "status": "ingested",
1219
+ "source_url": "https://gaokao.eol.cn/news/202606/t20260625_2749368.shtml",
1220
1220
  "source_org": "海南省考试局 ea.hainan.gov.cn",
1221
1221
  "year_verified_from": "title+url",
1222
- "format": "pending",
1223
- "notes": "6/25 仅出分数线;标准分制(~100-900)。2025 表 7/2 才发(eol t20250702_2678468)。预计~7/1后复查官方 ptgkyjszsb 频道。",
1222
+ "format": "html_table",
1223
+ "notes": "6/25 仅出分数线;标准分制(~100-900)。(已入库,见 cli/data/yifenyiduan/)",
1224
1224
  "eol_url": "https://gaokao.eol.cn/hai_nan/dongtai/"
1225
1225
  },
1226
1226
  {
@@ -1246,12 +1246,12 @@
1246
1246
  "物理类",
1247
1247
  "历史类"
1248
1248
  ],
1249
- "status": "not_published",
1250
- "source_url": "http://www.jxeea.cn/ptgk49/list.html",
1249
+ "status": "ingested",
1250
+ "source_url": "https://gaokao.eol.cn/jiang_xi/dongtai/202606/t20260625_2749328.shtml",
1251
1251
  "source_org": "江西省教育考试院 jxeea.cn",
1252
1252
  "year_verified_from": "title+url",
1253
- "format": "pending",
1254
- "notes": "6/25 仅出录取控制线;一分一段预计 6/26 左右(gk100/hfplg 标注)。复查 eol jiang_xi 频道。",
1253
+ "format": "html_table",
1254
+ "notes": "6/25 仅出录取控制线;一分一段预计 6/26 左右(gk100/hfplg 标注)。(已入库,见 cli/data/yifenyiduan/)",
1255
1255
  "eol_url": "https://gaokao.eol.cn/jiang_xi/dongtai/"
1256
1256
  },
1257
1257
  {
@@ -1262,12 +1262,12 @@
1262
1262
  "物理类",
1263
1263
  "历史类"
1264
1264
  ],
1265
- "status": "not_published",
1266
- "source_url": "https://gaokao.eol.cn/hu_nan/dongtai/",
1265
+ "status": "ingested",
1266
+ "source_url": "https://gaokao.eol.cn/hu_nan/dongtai/202606/t20260625_2749349.shtml",
1267
1267
  "source_org": "湖南省教育考试院 jyt.hunan.gov.cn / eol",
1268
1268
  "year_verified_from": "title+url",
1269
1269
  "format": "pending",
1270
- "notes": "6/25 仅出分数线;总分1分段统计表待发。**双口径,取含全国性加分列**。软核对锚点:600分以上 物理18876/历史2139人。",
1270
+ "notes": "6/25 仅出分数线;总分1分段统计表待发。(已入库,见 cli/data/yifenyiduan/)",
1271
1271
  "official_url": "https://jyt.hunan.gov.cn/"
1272
1272
  },
1273
1273
  {
@@ -1343,12 +1343,12 @@
1343
1343
  "物理类",
1344
1344
  "历史类"
1345
1345
  ],
1346
- "status": "not_published",
1347
- "source_url": "http://www.sxkszx.cn/",
1346
+ "status": "ingested",
1347
+ "source_url": "https://gaokao.eol.cn/shan_xi/dongtai/202606/t20260625_2749364.shtml",
1348
1348
  "source_org": "山西省招生考试管理中心 sxkszx.cn",
1349
1349
  "year_verified_from": "title+url",
1350
1350
  "format": "pending",
1351
- "notes": "6/25 一分一段表待发。复查 sxkszx.cn 与 eol shan_xi 频道(注意 slug=shan_xi)。",
1351
+ "notes": "6/25 一分一段表待发。(已入库,见 cli/data/yifenyiduan/)",
1352
1352
  "eol_url": "https://gaokao.eol.cn/shan_xi/dongtai/"
1353
1353
  },
1354
1354
  {
@@ -1359,12 +1359,12 @@
1359
1359
  "物理类",
1360
1360
  "历史类"
1361
1361
  ],
1362
- "status": "not_published",
1363
- "source_url": "https://www.eaagz.org.cn/",
1362
+ "status": "ingested",
1363
+ "source_url": "http://zsksy.guizhou.gov.cn/tzgg/202606/t20260625_90557425.html",
1364
1364
  "source_org": "贵州省招生考试院 eaagz.org.cn",
1365
1365
  "year_verified_from": "title+url",
1366
1366
  "format": "pending",
1367
- "notes": "6/25 一分一段表待发(2025 为 PDF/图片)。复查 eaagz.org.cn 与 eol gui_zhou 频道。",
1367
+ "notes": "6/25 一分一段表待发(2025 为 PDF/图片)。(已入库,见 cli/data/yifenyiduan/)",
1368
1368
  "eol_url": "https://gaokao.eol.cn/gui_zhou/dongtai/"
1369
1369
  },
1370
1370
  {
@@ -0,0 +1,26 @@
1
+ import Foundation
2
+ import Vision
3
+ import AppKit
4
+
5
// OCR helper script: recognizes text in the image named by the first
// command-line argument and prints one line per text observation as
// `text<TAB>x<TAB>y<TAB>width<TAB>height`, with the box given in integer
// pixels measured from the top-left corner of the image.

/// Writes `message` followed by a newline to standard error and exits with `status`.
func fail(_ message: String, status: Int32 = 1) -> Never {
    FileHandle.standardError.write(Data("\(message)\n".utf8))
    exit(status)
}

// Bail out with a usage message instead of trapping on an out-of-range
// index when the image path argument is missing.
guard CommandLine.arguments.count > 1 else {
    fail("usage: \(CommandLine.arguments.first ?? "ocr") <image-path>", status: 2)
}
let path = CommandLine.arguments[1]

guard let img = NSImage(contentsOfFile: path),
      let cg = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
    fail("cannot load image")
}

// Pixel dimensions used to scale Vision's normalized bounding boxes.
let W = Double(cg.width), H = Double(cg.height)

let req = VNRecognizeTextRequest { request, error in
    // Surface recognition failures instead of silently printing nothing.
    if let error = error {
        fail("text recognition failed: \(error.localizedDescription)")
    }
    guard let obs = request.results as? [VNRecognizedTextObservation] else { return }

    let lines: [String] = obs.compactMap { o in
        // Skip observations that produced no candidate string.
        guard let t = o.topCandidates(1).first else { return nil }
        let b = o.boundingBox // normalized, origin bottom-left
        // Scale to pixels and flip the y axis so that y is the distance of
        // the box's top edge from the top of the image.
        let x = b.minX*W, y = (1-b.maxY)*H, w = b.width*W, h = b.height*H
        return "\(t.string)\t\(Int(x))\t\(Int(y))\t\(Int(w))\t\(Int(h))"
    }
    print(lines.joined(separator: "\n"))
}
req.recognitionLevel = .accurate
req.usesLanguageCorrection = false
req.recognitionLanguages = ["zh-Hans", "en"]

let handler = VNImageRequestHandler(cgImage: cg, options: [:])
do {
    try handler.perform([req])
} catch {
    // A thrown error previously vanished behind `try?`, leaving exit status 0.
    fail("text recognition failed: \(error.localizedDescription)")
}