yaohao 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +163 -0
  3. package/bin/yaohao.js +47 -0
  4. package/package.json +53 -0
  5. package/skills/yaohao/SKILL.md +128 -0
  6. package/src/commands/calendar.js +29 -0
  7. package/src/commands/cron.js +87 -0
  8. package/src/commands/eligibility.js +28 -0
  9. package/src/commands/family.js +17 -0
  10. package/src/commands/history.js +17 -0
  11. package/src/commands/init.js +102 -0
  12. package/src/commands/market.js +40 -0
  13. package/src/commands/notify.js +92 -0
  14. package/src/commands/result.js +18 -0
  15. package/src/commands/set.js +53 -0
  16. package/src/commands/status.js +17 -0
  17. package/src/commands/waitlist.js +17 -0
  18. package/src/commands/watch.js +159 -0
  19. package/src/constants.js +18 -0
  20. package/src/lib/config-manager.js +67 -0
  21. package/src/lib/notifier.js +190 -0
  22. package/src/output.js +15 -0
  23. package/src/source/_shared/crawl.js +169 -0
  24. package/src/source/_shared/parseUtils.js +141 -0
  25. package/src/source/_shared/titleClassify.js +30 -0
  26. package/src/source/beijing/calendar.js +65 -0
  27. package/src/source/beijing/constants.js +8 -0
  28. package/src/source/beijing/crawl.js +156 -0
  29. package/src/source/beijing/eligibility.js +110 -0
  30. package/src/source/beijing/index.js +23 -0
  31. package/src/source/beijing/parse.js +206 -0
  32. package/src/source/beijing/pdfExtract.js +41 -0
  33. package/src/source/beijing/service.js +190 -0
  34. package/src/source/guangzhou/calendar.js +54 -0
  35. package/src/source/guangzhou/constants.js +16 -0
  36. package/src/source/guangzhou/eligibility.js +88 -0
  37. package/src/source/guangzhou/index.js +22 -0
  38. package/src/source/guangzhou/parse.js +61 -0
  39. package/src/source/guangzhou/service.js +126 -0
  40. package/src/source/hangzhou/calendar.js +60 -0
  41. package/src/source/hangzhou/constants.js +16 -0
  42. package/src/source/hangzhou/eligibility.js +102 -0
  43. package/src/source/hangzhou/index.js +20 -0
  44. package/src/source/hangzhou/parse.js +59 -0
  45. package/src/source/hangzhou/service.js +122 -0
  46. package/src/source/index.js +54 -0
  47. package/src/source/shenzhen/calendar.js +44 -0
  48. package/src/source/shenzhen/constants.js +14 -0
  49. package/src/source/shenzhen/eligibility.js +90 -0
  50. package/src/source/shenzhen/index.js +20 -0
  51. package/src/source/shenzhen/parse.js +58 -0
  52. package/src/source/shenzhen/service.js +122 -0
@@ -0,0 +1,156 @@
1
+ // 北京交通委公告站抓取层:负责 HTTP 请求、编码处理、本地缓存。
2
+ // 零外部依赖,依赖 Node 18+ 自带的 fetch。
3
+
4
+ import fs from 'node:fs/promises';
5
+ import path from 'node:path';
6
+ import crypto from 'node:crypto';
7
+ import { homedir } from 'node:os';
8
+
9
// Browser-like request headers sent with every outgoing fetch in this module.
const DEFAULT_HEADERS = {
  'User-Agent': [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    '(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
  ].join(' '),
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
  'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
};

// Root of the on-disk cache: <home>/.yaohao/cache
const CACHE_DIR = path.join(homedir(), '.yaohao', 'cache');
19
+
20
/**
 * Fetch a URL and return
 * { url, status, html, contentType, fromCache, elapsedMs, savedAt }.
 *
 * Any HTTP response (including non-200) is returned to the caller; only
 * status 200 responses are written to the local cache. Network errors and
 * timeouts are retried with a linearly growing delay.
 *
 * @param {string} url
 * @param {object} [opts]
 * @param {boolean} [opts.useCache=true] use the local cache to avoid hitting the site repeatedly
 * @param {number} [opts.timeoutMs=15000] per-attempt timeout (covers headers and body)
 * @param {number} [opts.retries=2] extra attempts after the first failure
 * @param {number} [opts.maxCacheAgeMs] maximum cache age in milliseconds; unlimited when omitted
 * @returns {Promise<object>}
 * @throws {Error} when every attempt fails; the last failure is attached as `cause`
 */
export async function fetchHtml(url, opts = {}) {
  const { useCache = true, timeoutMs = 15000, retries = 2, maxCacheAgeMs } = opts;

  if (useCache) {
    const cached = await readCache(url, maxCacheAgeMs);
    if (cached) {
      return { ...cached, fromCache: true };
    }
  }

  let lastErr = null;
  for (let attempt = 0; attempt <= retries; attempt++) {
    const t0 = Date.now();
    const ac = new AbortController();
    const timer = setTimeout(() => ac.abort(), timeoutMs);
    let payload;
    try {
      const res = await fetch(url, {
        headers: DEFAULT_HEADERS,
        signal: ac.signal,
        redirect: 'follow',
      });
      const contentType = res.headers.get('content-type') || '';
      const buf = Buffer.from(await res.arrayBuffer());
      payload = {
        url,
        status: res.status,
        contentType,
        html: decode(buf, contentType),
        elapsedMs: Date.now() - t0,
        fromCache: false,
        savedAt: Date.now(),
      };
    } catch (err) {
      lastErr = err;
      if (attempt < retries) {
        await sleep(500 * (attempt + 1));
      }
      continue;
    } finally {
      clearTimeout(timer);
    }

    // The cache write happens outside the retry `try`: a disk problem must not
    // be mistaken for a network failure and discard a successful download.
    if (useCache && payload.status === 200) {
      try {
        await writeCache(url, payload);
      } catch {
        /* cache is best-effort; the fetched payload is still valid */
      }
    }
    return payload;
  }
  throw new Error(
    `fetchHtml(${url}) failed after ${retries + 1} tries: ${lastErr?.message || lastErr}`,
    { cause: lastErr },
  );
}
79
+
80
/**
 * Derive a safe file extension for a cached binary from its URL.
 * Only the URL's path component is inspected, so query strings, fragments and
 * host names never leak into the cache file name.
 */
function binaryCacheExt(url) {
  let pathname = String(url);
  try {
    pathname = new URL(pathname).pathname;
  } catch {
    /* not an absolute URL: fall back to the raw string */
  }
  const ext = path.extname(pathname);
  return /^\.[\w-]{1,10}$/.test(ext) ? ext : '';
}

/**
 * Fetch a binary resource (PDF etc.) and return { url, status, buffer, fromCache }.
 * Cached separately under <CACHE_DIR>/binary/, named by the SHA-1 of the URL
 * plus the URL path's extension.
 *
 * @param {string} url
 * @param {object} [opts]
 * @param {boolean} [opts.useCache=true] serve from the cache when an entry exists
 * @param {number} [opts.timeoutMs=30000] timeout for the download
 * @returns {Promise<{url: string, status: number, buffer: Buffer, fromCache: boolean}>}
 * @throws {Error} on a non-2xx response (`HTTP <status>`), network error or timeout
 */
export async function fetchBinary(url, opts = {}) {
  const { useCache = true, timeoutMs = 30000 } = opts;
  const binDir = path.join(CACHE_DIR, 'binary');
  const digest = crypto.createHash('sha1').update(url).digest('hex');
  const file = path.join(binDir, digest + binaryCacheExt(url));

  if (useCache) {
    try {
      const buf = await fs.readFile(file);
      return { url, status: 200, buffer: buf, fromCache: true };
    } catch {
      /* miss */
    }
  }

  const ac = new AbortController();
  const timer = setTimeout(() => ac.abort(), timeoutMs);
  try {
    const res = await fetch(url, { headers: DEFAULT_HEADERS, signal: ac.signal });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    const buf = Buffer.from(await res.arrayBuffer());
    try {
      await fs.mkdir(binDir, { recursive: true });
      // Write to a temp file and rename, so an interrupted write can never be
      // picked up later as a (truncated) valid cache entry.
      const tmp = `${file}.${process.pid}.tmp`;
      await fs.writeFile(tmp, buf);
      await fs.rename(tmp, file);
    } catch {
      /* cache is best-effort; the downloaded buffer is still returned */
    }
    return { url, status: res.status, buffer: buf, fromCache: false };
  } finally {
    clearTimeout(timer);
  }
}
111
+
112
/**
 * Decode a response body to a string.
 * The charset is taken from the Content-Type header, then overridden by any
 * `charset=` declaration found in the first 1024 bytes of the body; the
 * default is UTF-8. Unknown charsets fall back to UTF-8 decoding.
 */
function decode(buf, contentType) {
  const headerCharset = /charset=([\w-]+)/i.exec(contentType)?.[1];
  const sniffed = buf.subarray(0, 1024).toString('ascii');
  const markupCharset = /charset=["']?([\w-]+)/i.exec(sniffed)?.[1];
  const charset = (markupCharset ?? headerCharset ?? 'utf-8').toLowerCase();

  const isUtf8 = charset === 'utf-8' || charset === 'utf8';
  if (!isUtf8) {
    try {
      return new TextDecoder(charset).decode(buf);
    } catch {
      /* unsupported label: fall through to UTF-8 */
    }
  }
  return buf.toString('utf8');
}
129
+
130
/** Resolve (with no value) after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
133
+
134
/** Cache file name for a URL: hex SHA-1 of the URL with a `.json` suffix. */
function cacheKey(url) {
  const hasher = crypto.createHash('sha1');
  hasher.update(url);
  return `${hasher.digest('hex')}.json`;
}
137
+
138
/**
 * Read the cached payload for a URL.
 *
 * @param {string} url
 * @param {number} [maxAgeMs] maximum accepted age in milliseconds. When it is
 *   null/undefined the entry never expires. `0` is a real limit (anything
 *   older than "now" is stale), not "unlimited".
 * @returns {Promise<object|null>} the payload, or null on a miss, a stale
 *   entry, or an unreadable/corrupt cache file
 */
async function readCache(url, maxAgeMs) {
  try {
    const file = path.join(CACHE_DIR, cacheKey(url));
    const payload = JSON.parse(await fs.readFile(file, 'utf8'));
    if (maxAgeMs != null) {
      const savedAt = Number(payload?.savedAt);
      // An entry without a usable timestamp cannot be proven fresh.
      if (!Number.isFinite(savedAt) || Date.now() - savedAt > maxAgeMs) {
        return null;
      }
    }
    return payload;
  } catch {
    // Missing file or invalid JSON: treat as a cache miss.
    return null;
  }
}
151
+
152
/** Persist a payload as JSON under CACHE_DIR, creating the directory if needed. */
async function writeCache(url, payload) {
  await fs.mkdir(CACHE_DIR, { recursive: true });
  const target = path.join(CACHE_DIR, cacheKey(url));
  const serialized = JSON.stringify(payload);
  await fs.writeFile(target, serialized, 'utf8');
}
@@ -0,0 +1,110 @@
1
+ import { input, select, confirm } from '@inquirer/prompts';
2
+
3
// Eligibility rules follow the Beijing passenger-car quantity control
// provisional regulation (2025 revision).
// The four applicant-type codes come from the official system's choosePerson
// page: BSHJ / JJ / ZZZ / GZJZZ.

// Applicant-type code -> label shown in the prompt.
const PERSON_TYPES = {
  // Beijing household-registered resident
  BSHJ: '本市户籍居民',
  // Active-duty military / armed police stationed in Beijing
  JJ: '驻京部队现役军人和现役武警',
  // Non-Beijing resident holding a valid residence permit
  ZZZ: '持有效居住证的非本市户籍人员',
  // Holder of a Beijing work-residence permit
  GZJZZ: '持北京市工作居住证人员',
};
12
+
13
/**
 * Interactive eligibility questionnaire for the Beijing lottery.
 *
 * Asks a fixed sequence of questions and stops at the first disqualifying
 * answer. Returns an object built by `result()`:
 * `{ pass, message, lines, ...details }`. When the user aborts a prompt
 * (ExitPromptError) it returns `{ pass: null, cancelled: true, ... }` instead
 * of throwing; any other error is rethrown.
 *
 * @returns {Promise<object>}
 */
export async function checkEligibility() {
  try {
    const personType = await select({
      message: '户籍/居住类型?',
      choices: [
        ...Object.entries(PERSON_TYPES).map(([k, v]) => ({ name: v, value: k })),
        { name: '以上都不是', value: 'NONE' },
      ],
    });
    if (personType === 'NONE') {
      return result(false, '不符合户籍/居住要求', { reason: 'huji_not_match' });
    }

    const age = Number(await input({
      message: '年龄?',
      default: '30',
      validate: (v) => (/^\d+$/.test(v) && Number(v) > 0) || '请输入正整数',
    }));
    if (age < 18) return result(false, '需年满 18 周岁', { age });

    const hasLicense = await confirm({ message: '是否持有有效机动车驾驶证?', default: true });
    if (!hasLicense) return result(false, '需持有有效机动车驾驶证');

    const hasBjPlate = await confirm({ message: '名下是否已有京牌小客车?', default: false });
    if (hasBjPlate) return result(false, '名下已有京牌小客车,须先处理');

    const hasIndicator = await confirm({ message: '名下是否已持有有效的小客车指标(含指标确认通知书)?', default: false });
    if (hasIndicator) return result(false, '已持有有效指标,不能重复申请');

    // Residence-permit holders need 5 continuous years of social insurance + income tax.
    if (personType === 'ZZZ') {
      const sbYears = Number(await input({
        message: '在京连续缴纳社保和个税年数(按整年算,断缴会清零)?',
        default: '0',
        validate: (v) => /^\d+(\.\d+)?$/.test(v) || '请输入数字',
      }));
      if (sbYears < 5) {
        // Never display "0.0" for someone who is still short (e.g. 4.96 years).
        const missing = Math.max(0.1, Number((5 - sbYears).toFixed(1))).toFixed(1);
        return result(false, `非京籍连续社保和个税不足 5 年,还差 ${missing} 年`, { sbYears });
      }
    }

    // Family lottery
    const considerFamily = await confirm({
      message: '是否考虑家庭摇号(积分制,比个人摇号中签率高)?',
      default: true,
    });
    let familyEligible = false;
    let familySize = 1;
    if (considerFamily) {
      familySize = Number(await input({
        message: '家庭主申请人 + 配偶 + 子女 + 双方父母合计有效成员数?',
        default: '2',
        // The count includes the main applicant, so it can never be 0.
        validate: (v) => (/^\d+$/.test(v) && Number(v) >= 1) || '请输入正整数',
      }));
      familyEligible = familySize >= 2;
    }

    const lines = ['符合申请条件:'];
    lines.push(' ✓ 个人普通指标(PTC)— 阶梯倍率制,参与年限越长倍率越高');
    lines.push(' ✓ 个人新能源指标(XNY)— 排队轮候制');
    if (familyEligible) {
      lines.push(' ✓ 家庭普通指标(PTC)— 家庭积分制,比个人中签率高');
      lines.push(' ✓ 家庭新能源指标(XNY)— 家庭积分轮候');
    } else if (considerFamily) {
      lines.push(' ✗ 家庭摇号:需至少 2 名家庭成员');
    }
    lines.push('');
    lines.push('下一步:在申请窗口期(每年 1/1-3/8、8/1-10/8)登录 xkczb.jtw.beijing.gov.cn 提交申请。');
    lines.push('提示:本判定为简化版,复杂情况以官方说明为准。');

    return result(true, lines.join('\n'), {
      personType,
      personTypeLabel: PERSON_TYPES[personType],
      age,
      familySize,
      eligibility: {
        personal_ptc: true,
        personal_xny: true,
        family_ptc: familyEligible,
        family_xny: familyEligible,
      },
    });
  } catch (err) {
    // The prompt library signals Ctrl-C / abort with an ExitPromptError.
    if (err && err.name === 'ExitPromptError') {
      return { pass: null, cancelled: true, message: '已取消', lines: ['已取消'] };
    }
    throw err;
  }
}
102
+
103
/**
 * Build the questionnaire result object.
 * `lines` holds the message as-is on success, or prefixed with the
 * "not eligible" marker on failure; keys from `extra` are merged in between.
 */
function result(pass, message, extra = {}) {
  const headline = pass ? message : `不符合:${message}`;
  return Object.assign({ pass, message }, extra, { lines: [headline] });
}
@@ -0,0 +1,23 @@
1
+ // 北京小客车摇号 source 入口
2
+ // 统一接口规范:meta / getCalendar / checkEligibility / extractMarketMetrics / watchTargets
3
+
4
+ import { SYSTEM_URL } from './constants.js';
5
+ import { getCalendar } from './calendar.js';
6
+ import { checkEligibility } from './eligibility.js';
7
+ import { extractMarketMetrics, watchTargets } from './service.js';
8
+
9
// Static descriptor of the Beijing source.
export const meta = {
  name: 'beijing',
  label: '北京',
  // Imported from ./constants.js
  systemUrl: SYSTEM_URL,
  // Application kinds offered by this source
  applyTypes: ['person', 'family'],
  // Indicator codes; the eligibility module labels PTC as the ordinary
  // indicator and XNY as the new-energy indicator
  regTypes: ['PTC', 'XNY'],
  supported: true,
};
17
+
18
+ export {
19
+ getCalendar,
20
+ checkEligibility,
21
+ extractMarketMetrics,
22
+ watchTargets,
23
+ };
@@ -0,0 +1,206 @@
1
+ // 北京交通委公告站解析层:把 HTML 字符串转成结构化数据。
2
+ // 零依赖,纯正则。
3
+
4
+ /* ---------- 通用工具 ---------- */
5
+
6
// Named character references that stripTags understands (curly quotes are
// normalised to straight quotes).
const STRIP_TAGS_ENTITIES = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['ldquo', '"'],
  ['rdquo', '"'],
  ['lsquo', "'"],
  ['rsquo', "'"],
]);

/**
 * Convert an HTML fragment to plain text.
 *
 * Removes <script>/<style> blocks, turns <br> into newlines, drops all other
 * tags, decodes a small set of named entities plus decimal/hex numeric
 * references, then collapses runs of spaces/tabs and blank lines.
 *
 * Entities are decoded in a single pass so that escaped text is decoded
 * exactly once (`&amp;lt;` -> `&lt;`, not `<`). Unknown names and numeric
 * references outside the Unicode range are left untouched instead of throwing.
 *
 * @param {string} html
 * @returns {string} plain text; '' for empty/null input
 */
export function stripTags(html) {
  if (!html) return '';
  const withoutTags = String(html)
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<[^>]+>/g, '');

  const decoded = withoutTags.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-z]+);/g, (whole, body) => {
    if (body[0] !== '#') {
      return STRIP_TAGS_ENTITIES.get(body) ?? whole;
    }
    const isHex = body[1] === 'x' || body[1] === 'X';
    const code = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws a RangeError above U+10FFFF.
    return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
  });

  return decoded
    .replace(/[ \t]+/g, ' ')
    .replace(/\n\s*\n+/g, '\n')
    .trim();
}
25
+
26
/**
 * Convert full-width digits and number punctuation to their ASCII forms:
 * U+FF10..U+FF19 -> 0-9, U+FF0C -> ',', U+FF0E -> '.', U+FF05 -> '%'.
 *
 * The full-width characters are written as \u escapes on purpose: a character
 * class that ends up containing ASCII digits would shift every ordinary digit
 * by 0xFEE0 and corrupt the text.
 *
 * @param {string} s
 * @returns {string} the normalised string ('' for null/undefined)
 */
export function normalizeNumberLike(s) {
  // Each of these full-width code points sits exactly 0xFEE0 above its ASCII twin.
  return String(s ?? '').replace(/[\uFF10-\uFF19\uFF0C\uFF0E\uFF05]/g, (ch) =>
    String.fromCharCode(ch.charCodeAt(0) - 0xfee0),
  );
}
33
+
34
+ /* ---------- 列表页:jggb / xwzz / bszn 同一套模板 ---------- */
35
+
36
/**
 * Parse a list page into entries of { title, date, url, period, kind }.
 * Entries are the <dd> elements inside the "subpage_list" container (or the
 * whole document when that container is not found); <dd> elements without an
 * `<a class="text" href="…">` link are skipped.
 */
export function parseListPage(html, baseUrl) {
  const listSection =
    /<div\s+class="subpage_list">([\s\S]*?)<\/div>\s*(?:<div\s+class="pageturn"|<\/div>)/i.exec(html);
  const scope = listSection ? listSection[1] : html;

  const entries = [];
  for (const [, ddBody] of scope.matchAll(/<dd[^>]*>([\s\S]*?)<\/dd>/gi)) {
    const link = /<a\s+class="text"\s+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i.exec(ddBody);
    if (!link) continue;

    const [, href, titleHtml] = link;
    const dateSpan = /<span\s+class="date">([\s\S]*?)<\/span>/i.exec(ddBody);
    const title = stripTags(titleHtml);

    entries.push({
      title,
      date: dateSpan ? stripTags(dateSpan[1]) : null,
      url: absUrl(href, baseUrl),
      period: extractPeriod(title),
      kind: classifyTitle(title),
    });
  }
  return entries;
}
56
+
57
/**
 * Read the pagination widget (`<div class="pageturn">`) of a list page.
 *
 * @param {string} html
 * @param {string} baseUrl base used to resolve the "next page" link
 * @returns {{current: number, totalPages: number, nextUrl: string|null}}
 *   defaults to page 1 of 1 with `nextUrl: null` when the widget or a field
 *   is missing, so the result always has the same shape
 */
export function parsePagination(html, baseUrl) {
  const block = /<div\s+class="pageturn">([\s\S]*?)<\/div>/i.exec(html);
  if (!block) return { current: 1, totalPages: 1, nextUrl: null };
  const seg = block[1];
  const cur = /当前第(\d+)页/.exec(seg);
  const total = /共(\d+)页/.exec(seg);
  const next = /<a\s+href="([^"]+)"[^>]*>下一页<\/a>/i.exec(seg);
  return {
    current: cur ? Number(cur[1]) : 1,
    totalPages: total ? Number(total[1]) : 1,
    nextUrl: next ? absUrl(next[1], baseUrl) : null,
  };
}
70
+
71
/**
 * Parse a Chinese numeral between 1 and 99 (e.g. 三, 十, 十二, 二十, 二十一).
 * Returns null for anything else.
 */
function parseCnPeriodNo(raw) {
  const digit = { 一: 1, 二: 2, 三: 3, 四: 4, 五: 5, 六: 6, 七: 7, 八: 8, 九: 9 };
  const m = /^([一二三四五六七八九])?(十)?([一二三四五六七八九])?$/.exec(raw);
  if (!m || !raw) return null;
  const [, lead, ten, tail] = m;
  if (ten) return (lead ? digit[lead] : 1) * 10 + (tail ? digit[tail] : 0);
  // Without 十 only a single digit character is a valid number.
  return lead && !tail ? digit[lead] : null;
}

/**
 * Extract the lottery period ("<year>年第<n>期") from a title.
 * The period number may be written with Arabic digits or Chinese numerals,
 * including multi-character ones such as 十二.
 *
 * @param {string} title
 * @returns {{year: number, no: number, label: string}|null} null when the
 *   title carries no recognisable period
 */
function extractPeriod(title) {
  const m = /(20\d{2})年(?:第)?\s*([一二三四五六七八九十0-9]+)\s*期/.exec(title);
  if (!m) return null;
  const year = Number(m[1]);
  const rawNo = m[2];
  const no = /^\d+$/.test(rawNo) ? Number(rawNo) : parseCnPeriodNo(rawNo);
  return no == null ? null : { year, no, label: `${year}年第${no}期` };
}
80
+
81
/**
 * Classify an announcement title by keyword.
 * Rules are tried in order and the first match wins; titles that match
 * nothing are 'other'.
 */
function classifyTitle(t) {
  const rules = [
    [/配置结果/, 'result'],
    [/申请审核结果和配置工作/, 'config_notice'],
    [/亲属关系和婚姻状况核查/, 'family_check'],
    [/资格审核结果/, 'qualify_review'],
    [/指标配额/, 'quota'],
    [/买卖|出租|承租|出借|借用/, 'penalty'],
    [/十问十答|温馨提示/, 'faq'],
  ];
  const hit = rules.find(([pattern]) => pattern.test(t));
  return hit ? hit[1] : 'other';
}
91
+
92
+ /* ---------- 详情页:subpage_article ---------- */
93
+
94
/**
 * Parse an announcement detail page.
 * Works on the "subpage_article" container (or the whole document when it is
 * not found) and returns
 * { url, title, dateText, dateIso, bodyText, attachments, metrics }.
 */
export function parseDetailPage(html, url) {
  const articleMatch =
    /<div\s+class="subpage_article">([\s\S]*?)<div\s+class="clearboth">/i.exec(html);
  const scope = articleMatch ? articleMatch[1] : html;

  // Plain text of the first element matched by `re`, or null when absent.
  const innerText = (re) => {
    const found = re.exec(scope);
    return found ? stripTags(found[1]) : null;
  };

  const title = innerText(/<h2[^>]*>([\s\S]*?)<\/h2>/i);
  const h4Text = innerText(/<h4[^>]*>([\s\S]*?)<\/h4>/i) ?? '';

  // The publish date lives in the <h4> line.
  const dateText = /发布日期[::]\s*([0-9]{4}年[0-9]{1,2}月[0-9]{1,2}日)/.exec(h4Text)?.[1] || null;
  const dateIso = dateText ? cnDateToIso(dateText) : null;

  // Links whose href ends in a document/archive extension are attachments.
  const attachments = Array.from(
    scope.matchAll(/<a[^>]+href="([^"]+\.(?:pdf|docx?|xlsx?|zip))"[^>]*>([\s\S]*?)<\/a>/gi),
    ([, href, label]) => ({
      url: absUrl(href, url),
      name: stripTags(label).replace(/^下载/, '').trim() || basename(href),
    }),
  );

  const bodyText = stripTags(scope);

  return {
    url,
    title,
    dateText,
    dateIso,
    bodyText,
    attachments,
    metrics: extractMetricsFromText(bodyText),
  };
}
129
+
130
+ /* ---------- 字段抽取器:从中文段落里捞数字 ---------- */
131
+
132
/**
 * Pull numeric metrics out of a Chinese announcement (or PDF) text.
 *
 * Whitespace is collapsed and the text is passed through
 * normalizeNumberLike() first. Each metric is looked up with its own pattern
 * and is only present in the result when its pattern matched.
 *
 * Separators accept both the ASCII and the full-width form (':' / U+FF1A,
 * '/' / U+FF0F); the full-width characters are written as \u escapes so they
 * cannot be silently folded to ASCII.
 *
 * @param {string} rawText
 * @returns {object} sparse object of metrics (counts as numbers, rates as strings)
 */
export function extractMetricsFromText(rawText) {
  const text = normalizeNumberLike(String(rawText ?? '').replace(/\s+/g, ' '));
  const r = {};

  // Store convert(capture 1) of the first matching pattern under `key`.
  const grab = (key, patterns, convert) => {
    for (const re of patterns) {
      const m = re.exec(text);
      if (m) {
        r[key] = convert(m[1]);
        return;
      }
    }
  };
  const asIs = (s) => s;

  // Application counts (ordinary indicators)
  grab('familyApplyCount', [/家庭普通小客车指标申请共?计?\s*([\d,]+)\s*个?有效编码/], num);
  grab('personalApplyCount', [/个人普通小客车指标申请共?计?\s*([\d,]+)\s*个?有效编码/], num);
  grab('unitApplyCount', [/单位普通小客车指标申请共?计?\s*([\d,]+)\s*家/], num);

  // Allocation counts
  grab('familyPersonAlloc', [/家庭(?:和|与)个人普通小客车指标共?计?\s*([\d,]+)\s*个/], num);
  grab('unitAlloc', [/配置单位普通小客车指标\s*([\d,]+)\s*个/], num);

  // New-energy indicators
  grab('familyNevApplyCount', [/家庭新能源小客车指标申请共?计?\s*([\d,]+)/], num);
  grab('personalNevApplyCount', [/个人新能源小客车指标申请共?计?\s*([\d,]+)/], num);
  grab(
    'nevWaitYears',
    [/新能源(?:小客车)?指标(?:轮候|排队)(?:.*?)(?:约|预计)?\s*([\d.]+)\s*年/],
    Number,
  );

  // Minimum winning family score: two phrasings, first match wins.
  grab(
    'familyMinScore',
    [
      /家庭(?:摇号)?(?:最低)?中签(?:家庭)?(?:积分|分值)(?:为|是|[:\uFF1A])\s*([\d.]+)/,
      /家庭(?:摇号)?(?:积分|分值)(?:最低|不低于).*?([\d.]+)\s*分/,
    ],
    Number,
  );

  // Winning rates are kept as text ("0.5%" or "1/200").
  grab(
    'personalRate',
    [/个人(?:普通(?:小客车)?指标)?中签率(?:约)?(?:为|是|[:\uFF1A])?\s*([\d.]+%|1[/\uFF0F]\d+)/],
    asIs,
  );
  grab(
    'unitRate',
    [/单位(?:普通(?:小客车)?指标)?中签率(?:约)?(?:为|是|[:\uFF1A])?\s*([\d.]+%|1[/\uFF0F]\d+)/],
    asIs,
  );

  grab('blackListCount', [/共有\s*([\d,]+)\s*个失信被执行人/], num);

  // Key fields found in the allocation-result PDF
  grab('validEncodeCount', [/有效编码总数[:\uFF1A]\s*([\d,]+)/], num);
  grab('baseSeedTotal', [/基数序号总数[:\uFF1A]\s*([\d,]+)/], num);
  grab('allocTotal', [/指标配置总数[:\uFF1A]\s*([\d,]+)/], num);
  grab('randomSeed', [/指标配置种子数[:\uFF1A]\s*([\d,]+)/], num);

  return r;
}
181
+
182
+ /* ---------- 小工具 ---------- */
183
+
184
/** Parse a number that may contain thousands separators ("1,234" -> 1234). */
function num(s) {
  const withoutCommas = String(s).split(',').join('');
  return Number(withoutCommas);
}
187
+
188
/** Convert "YYYY年M月D日" (found anywhere in `s`) to "YYYY-MM-DD"; null if absent. */
function cnDateToIso(s) {
  const parts = /(\d{4})年(\d{1,2})月(\d{1,2})日/.exec(s);
  if (parts === null) return null;
  const [, year, month, day] = parts;
  return [year, month.padStart(2, '0'), day.padStart(2, '0')].join('-');
}
193
+
194
/**
 * Resolve `href` against `base`.
 * Falsy hrefs and hrefs that already start with http(s):// are returned
 * unchanged; if resolution fails the original href is returned.
 */
function absUrl(href, base) {
  if (!href) return href;

  const isAlreadyAbsolute = /^https?:\/\//i.test(href);
  if (isAlreadyAbsolute) return href;

  let resolved = href;
  try {
    resolved = new URL(href, base).toString();
  } catch {
    /* unresolvable: keep the raw href */
  }
  return resolved;
}
203
+
204
/** Last path segment (split on / or \); the input itself when that segment is empty. */
function basename(p) {
  const segments = String(p).split(/[\\/]/);
  const last = segments[segments.length - 1];
  return last || p;
}
@@ -0,0 +1,41 @@
1
+ // PDF 解析模块:把 attachment 里的 PDF 转纯文本,复用 parse.js 的 extractMetricsFromText。
2
+ // 用 pdfjs-dist(纯 JS,无原生依赖)。
3
+
4
+ import { fetchBinary } from './crawl.js';
5
+ import { extractMetricsFromText } from './parse.js';
6
+
7
// Module namespace of pdfjs-dist, loaded on first use.
let _pdfjs = null;

/** Lazily import pdfjs-dist (legacy build) and memoise the module namespace. */
async function getPdfjs() {
  if (_pdfjs) {
    return _pdfjs;
  }
  _pdfjs = await import('pdfjs-dist/legacy/build/pdf.mjs');
  return _pdfjs;
}
15
+
16
/**
 * Download a PDF and extract the plain text of every page.
 * Text items of a page are joined with spaces, pages with newlines.
 *
 * @param {string} pdfUrl
 * @returns {Promise<string>}
 */
export async function extractPdfText(pdfUrl) {
  const pdfjs = await getPdfjs();
  const { buffer } = await fetchBinary(pdfUrl);
  const doc = await pdfjs.getDocument({ data: new Uint8Array(buffer) }).promise;
  try {
    const pages = [];
    for (let i = 1; i <= doc.numPages; i++) {
      const page = await doc.getPage(i);
      const content = await page.getTextContent();
      pages.push(content.items.map((it) => it.str).join(' '));
    }
    return pages.join('\n');
  } finally {
    // Release the document even when a page fails to parse.
    await doc.destroy();
  }
}
33
+
34
/**
 * Download a PDF and run the shared text-metric extractor over its text.
 *
 * @param {string} pdfUrl
 * @returns {Promise<object>} whatever extractMetricsFromText finds in the PDF text
 */
export async function extractPdfMetrics(pdfUrl) {
  return extractMetricsFromText(await extractPdfText(pdfUrl));
}