@parseo/shared 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,65 @@
1
+ # @parseo/shared
2
+
3
+ PDF text extraction, document classifier, error classes, and shared utilities for Parseo parsers.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install @parseo/shared
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ### Text extraction
14
+
15
+ ```typescript
16
+ import { extractLines } from "@parseo/shared";
17
+
18
+ const lines = await extractLines(buffer);
19
+ // TextLine[] with text, position, page number, and bounding boxes
20
+ ```
21
+
22
+ ### Document classification
23
+
24
+ ```typescript
25
+ import { classifyDocument } from "@parseo/shared";
26
+
27
+ const result = classifyDocument(lines);
28
+ // { format: "chase", startPage: 1, skip: 0, confidence: 28 }
29
+
30
+ // Limit to a specific package scope
31
+ classifyDocument(lines, "bank-statements");
32
+ classifyDocument(lines, "credit-reports");
33
+ ```
34
+
35
+ ### Error classes
36
+
37
+ ```typescript
38
+ import {
39
+ ParserError,
40
+ InvalidPDFError,
41
+ UnrecognizedFormatError,
42
+ MissingSectionError,
43
+ ExtractionError,
44
+ } from "@parseo/shared";
45
+ ```
46
+
47
+ | Error | When |
48
+ |---|---|
49
+ | `InvalidPDFError` | Buffer is empty, not a PDF, or encrypted |
50
+ | `UnrecognizedFormatError` | PDF text doesn't match expected provider |
51
+ | `MissingSectionError` | Format matched but required field missing |
52
+ | `ExtractionError` | No extractable text (scanned image) |
53
+
54
+ ### Parsing utilities
55
+
56
+ ```typescript
57
+ import { parseDate, parseCurrency, parseNum, parseDateRange } from "@parseo/shared";
58
+
59
+ parseDate("08/31/2024"); // "2024-08-31"
60
+ parseCurrency("$1,234.56"); // 1234.56
61
+ ```
62
+
63
+ ## License
64
+
65
+ MIT
@@ -0,0 +1,24 @@
1
+ import type { TextLine } from "./types.js";
2
+ export type FormatName = "smartlinx" | "credit-report" | "richer-values" | "form-1004mc" | "form-1073" | "wells-fargo" | "td-bank" | "chase" | "bank-of-america" | "navy-federal" | "third-federal" | "citibank" | "relay" | "grove-bank" | "capital-one" | "truist" | "pnc" | "discover" | "synovus";
3
+ export interface ClassifyResult {
4
+ /** Which parser family to use */
5
+ format: FormatName;
6
+ /** 1-based page number where the recognised content starts */
7
+ startPage: number;
8
+ /** Number of intro pages to strip (startPage − 1) */
9
+ skip: number;
10
+ /** Internal score — higher means more patterns matched */
11
+ confidence: number;
12
+ }
13
+ export type PackageName = "credit-reports" | "bank-statements" | "background-checks" | "appraisals";
14
+ /**
15
+ * Examine the extracted text lines page-by-page and determine which parser
16
+ * to use and how many intro pages to skip.
17
+ *
18
+ * @param scope — optional package name to limit classification to formats
19
+ * belonging to that package (e.g. `"bank-statements"`, `"credit-reports"`).
20
+ *
21
+ * Returns `null` if no known format is detected in the first 5 distinct pages present in `lines`; scanning stops at the first page on which any format matches.
22
+ */
23
+ export declare function classifyDocument(lines: TextLine[], scope?: PackageName): ClassifyResult | null;
24
+ //# sourceMappingURL=classify.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"classify.d.ts","sourceRoot":"","sources":["../src/classify.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAI3C,MAAM,MAAM,UAAU,GAClB,WAAW,GACX,eAAe,GACf,eAAe,GACf,aAAa,GACb,WAAW,GACX,aAAa,GACb,SAAS,GACT,OAAO,GACP,iBAAiB,GACjB,cAAc,GACd,eAAe,GACf,UAAU,GACV,OAAO,GACP,YAAY,GACZ,aAAa,GACb,QAAQ,GACR,KAAK,GACL,UAAU,GACV,SAAS,CAAC;AAEd,MAAM,WAAW,cAAc;IAC7B,iCAAiC;IACjC,MAAM,EAAE,UAAU,CAAC;IACnB,8DAA8D;IAC9D,SAAS,EAAE,MAAM,CAAC;IAClB,qDAAqD;IACrD,IAAI,EAAE,MAAM,CAAC;IACb,0DAA0D;IAC1D,UAAU,EAAE,MAAM,CAAC;CACpB;AAyaD,MAAM,MAAM,WAAW,GACnB,gBAAgB,GAChB,iBAAiB,GACjB,mBAAmB,GACnB,YAAY,CAAC;AA6BjB;;;;;;;;GAQG;AACH,wBAAgB,gBAAgB,CAC9B,KAAK,EAAE,QAAQ,EAAE,EACjB,KAAK,CAAC,EAAE,WAAW,GAClB,cAAc,GAAG,IAAI,CA2CvB"}
@@ -0,0 +1,457 @@
1
/**
 * Detection profiles, one per supported document format.
 *
 * Each profile is matched against the text of a single page:
 *  - `name`       — the format identifier reported to callers.
 *  - `primary`    — identifying patterns; a profile is only considered when at
 *                   least one of them matches. Each hit is worth 10 points.
 *  - `supporting` — corroborating patterns; each hit is worth 1 point.
 *  - `exclude`    — if any of these matches, the profile is skipped for the page.
 *
 * Array order matters: when two profiles reach the same score, the one listed
 * first wins.
 *
 * None of the patterns use the `g` or `y` flag, so `RegExp.prototype.test`
 * stays stateless and the literals can be shared safely.
 */
const profiles = [
    // ── Form 1004 (URAR) ─────────────────────────────────────
    {
        name: "form-1004mc",
        primary: [
            /Uniform Residential Appraisal Report/i,
            /Fannie Mae Form 1004\b/i,
            /Freddie Mac Form 70\b/i,
            /Form 1004\s*UAD/i,
        ],
        supporting: [
            /Neighborhood Name/i,
            /One-Unit Housing Trends/i,
            /One-Unit Housing/i,
            /PUD\s+HOA/i,
            /Units\s+One\s+One with Accessory Unit/i,
            /Type\s+Det\.\s+Att\./i,
            /Finished area above grade contains/i,
            /COST APPROACH TO VALUE/i,
            /PROJECT INFORMATION FOR PUDs/i,
        ],
        exclude: [
            /Individual Condominium Unit Appraisal Report/i,
            /Form 1073/i,
        ],
    },
    // ── Form 1073 (Condo) ────────────────────────────────────
    {
        name: "form-1073",
        primary: [
            /Individual Condominium Unit Appraisal Report/i,
            /Fannie Mae Form 1073/i,
            /Freddie Mac Form 465/i,
            /Form 1073/i,
        ],
        supporting: [
            /Unit\s*#/i,
            /Project Name/i,
            /Condominium Unit Housing Trends/i,
            /Condominium Housing/i,
            /PROJECT SITE/i,
            /# of Elevators/i,
            /conversion of existing building/i,
            /HOA Mo\.\s*Assessment/i,
            /Floor Location/i,
        ],
        exclude: [
            /Uniform Residential Appraisal Report/i,
        ],
    },
    // ── Richer Values (Renovation Analysis) ───────────────────
    {
        name: "richer-values",
        primary: [
            /Renovation Analysis/i,
            /richervalues\.com/i,
        ],
        supporting: [
            /Valuation Summary and Parameters/i,
            /Hyper-Local Neighborhood/i,
            /Budget Assessment/i,
            /Estimated Valuation.*ARV/i,
            /Distance-Based Comps/i,
            /Renovation Strategies/i,
            /Estimated As Is Market Value/i,
            /Estimated ARV at Target Condition/i,
        ],
        exclude: [],
    },
    // ── Credit Reports (CreditXpert, PCB, Xactus) ────────────
    {
        name: "credit-report",
        primary: [
            /CreditXpert/i,
            /MERGED INFILE CREDIT REPORT/i,
            /PREMIUM CREDIT BUREAU/i,
            /370 Reed Rd/i,
            /800-243-0120/,
            /Broomall,?\s*PA/i,
            /Order Verifications/i,
        ],
        supporting: [
            /FICO/i,
            /Credit Summary/i,
            /Credit Score Information/i,
            /Credit History/i,
            /Repositories.*(?:TUC|EXP|EQX)/i,
            /Current score.*Potential score/i,
            /ECOA KEY/i,
            /SCORE MODELS/i,
            /Client Code/i,
            /Report ID/i,
            /EQUIFAX|TRANSUNION|EXPERIAN/i,
        ],
        exclude: [],
    },
    // ── Wells Fargo (Bank Statements) ─────────────────────────
    {
        name: "wells-fargo",
        primary: [
            /Wells\s*Fargo/i,
            /wellsfargo\.com/i,
        ],
        supporting: [
            /Statement period activity summary/i,
            /Transaction history/i,
            /Beginning balance on/i,
            /Ending balance on/i,
            /Account number:/i,
            /Business Checking/i,
            /Monthly service fee summary/i,
            /1-800-CALL-WELLS/i,
        ],
        exclude: [],
    },
    // ── Chase (Bank Statements) ───────────────────────────────
    {
        name: "chase",
        primary: [
            /JPMorgan Chase/i,
            /Chase\.com/i,
            // Word boundaries on BOTH sides: without the leading one this
            // case-sensitive pattern also matched the tail of "PURCHASE",
            // which appears on almost every bank statement.
            /\bCHASE\b/,
        ],
        supporting: [
            /CHECKING SUMMARY/i,
            /DEPOSITS AND ADDITIONS/i,
            /ELECTRONIC WITHDRAWALS/i,
            /ATM & DEBIT CARD/i,
            /DAILY ENDING BALANCE/i,
            /Chase Business Complete/i,
            /1-800-242-7338/,
            /Account Number:/i,
        ],
        exclude: [],
    },
    // ── TD Bank (Bank Statements) ─────────────────────────────
    {
        name: "td-bank",
        primary: [
            /TD\s*Bank/i,
            /tdbank\.com/i,
        ],
        supporting: [
            /STATEMENT OF ACCOUNT/i,
            /ACCOUNT SUMMARY/i,
            /DAILY ACCOUNT ACTIVITY/i,
            /Beginning Balance/i,
            /Ending Balance/i,
            /Primary Account #/i,
            /Average Collected Balance/i,
            /1-800-937-2000/,
        ],
        exclude: [],
    },
    // ── Bank of America (Bank Statements) ─────────────────────
    {
        name: "bank-of-america",
        primary: [
            /Bank of America/i,
            /bankofamerica\.com/i,
            /bofa\.com/i,
        ],
        supporting: [
            /Beginning balance on/i,
            /Ending balance on/i,
            /Deposits and other credits/i,
            /Withdrawals and other debits/i,
            /Daily ledger balance/i,
            /Service fees/i,
            /Business Advantage/i,
            /Account number:/i,
        ],
        exclude: [],
    },
    // ── Navy Federal (Bank Statements) ────────────────────────
    {
        name: "navy-federal",
        primary: [
            /Navy Federal/i,
            /navyfederal\.org/i,
        ],
        supporting: [
            /Statement of Account/i,
            /Access No\./i,
            /Summary of your deposit accounts/i,
            /Business Checking/i,
            /Mbr Business Savings/i,
            /1-888-842-6328/,
            /NCUA/i,
            /Routing Number:/i,
        ],
        exclude: [],
    },
    // ── Relay (Bank Statements) ────────────────────────────────
    {
        name: "relay",
        primary: [
            /relayfi\.com/i,
            /Relay Financials/i,
        ],
        supporting: [
            /Thread Bank/i,
            /Account Statement/i,
            /Owners:/i,
            /Opening Balance.*Closing Balance/i,
            /1-888-205-9304/,
            /Routing Number:/i,
        ],
        exclude: [],
    },
    // ── Citibank (Bank Statements) ─────────────────────────────
    {
        name: "citibank",
        primary: [
            /Citibank/i,
            /CitiBusiness/i,
            /CITIBANK,\s*N\.\s*A\./i,
        ],
        supporting: [
            /CHECKING ACTIVITY/i,
            /SAVINGS ACTIVITY/i,
            /Beginning Balance:/i,
            /Ending Balance:/i,
            /SERVICE CHARGE SUMMARY/i,
            /Citibusiness Service Center/i,
            /877.*528.*0990/,
            /Relationship Summary/i,
        ],
        exclude: [],
    },
    // ── Third Federal (HELOC Statements) ──────────────────────
    {
        name: "third-federal",
        primary: [
            /Third Federal/i,
            /thirdfederal\.com/i,
            /THIRD FEDERAL SAVINGS/i,
        ],
        supporting: [
            /Equity Line of Credit Statement/i,
            /Statement Closing Date/i,
            /Credit Line/i,
            /Principal Balance/i,
            /Principal Amount/i,
            /Finance Charge Calculation Summary/i,
            /1-877-552-5659/,
            /Account Summary/i,
            /Payment Summary/i,
        ],
        exclude: [],
    },
    // ── PNC (Bank Statements) ─────────────────────────────────
    {
        name: "pnc",
        primary: [
            /PNC Bank/i,
            /pnc\.com/i,
        ],
        supporting: [
            /Business Checking/i,
            /Balance Summary/i,
            /Activity Detail/i,
            /Deposits and Other Additions/i,
            /Checks and Other Deductions/i,
            /1-877-287-2654/,
            /Pittsburgh, PA/i,
            /Member FDIC/i,
        ],
        exclude: [],
    },
    // ── Truist (Bank Statements) ──────────────────────────────
    {
        name: "truist",
        primary: [
            /Truist/i,
            /4TRUIST/,
            /Truist\.com/i,
        ],
        supporting: [
            /SIMPLE BUSINESS CHECKING/i,
            /Account summary/i,
            /Your previous balance/i,
            /Your new balance/i,
            /Deposits, credits and interest/i,
            /Other withdrawals, debits/i,
            /844.*487.*8478/,
            /MEMBER FDIC/i,
        ],
        exclude: [],
    },
    // ── Capital One (Bank Statements) ─────────────────────────
    {
        name: "capital-one",
        primary: [
            /Capital One/i,
            /capitalone\.com/i,
        ],
        supporting: [
            /360 Performance Savings/i,
            /STATEMENT PERIOD/i,
            /TOTAL ENDING BALANCE/i,
            /IN ALL ACCOUNTS/i,
            /Account Summary/i,
            /Cashflow Summary/i,
            /1-888-464-0727/,
            /P\.O\. Box 85123/i,
        ],
        exclude: [],
    },
    // ── Grove Bank (Bank Statements) ──────────────────────────
    {
        name: "grove-bank",
        primary: [
            /Grove\s*Bank/i,
            /grovebankandtrust\.com/i,
        ],
        supporting: [
            /CHECKING ACCOUNTS/i,
            /Business Checking/i,
            /Statement Dates/i,
            /Previous Balance/i,
            /Current Balance/i,
            /Deposits\/Credits/i,
            /Checks\/Debits/i,
            /305-858-6666/,
            /MEMBER FDIC/i,
        ],
        exclude: [],
    },
    // ── Discover (Bank Statements) ─────────────────────────────
    {
        name: "discover",
        primary: [
            /\bDiscover\b/,
            /Discover\.com/i,
        ],
        supporting: [
            /MONEY MARKET/i,
            /ACCOUNT SUMMARY/i,
            /ACCOUNT ACTIVITY/i,
            /1-800-347-7000/,
            /Deposits and Credits/i,
            /Electronic Withdrawals/i,
            /Annual Percentage Yield Earned/i,
            /Interest Earned This Period/i,
            /Salt Lake City, UT/i,
        ],
        exclude: [],
    },
    // ── Synovus (Bank Statements) ──────────────────────────────
    {
        name: "synovus",
        primary: [
            /P\.O\.\s*Box\s*2646/i,
            /888-796-6887/,
        ],
        supporting: [
            /Statement of Account/i,
            /Pro Business Checking/i,
            /Other Debits/i,
            /Deposits\/Other Credits/i,
            /Columbus,?\s*GA/i,
            /Balance Summary/i,
            /Direct inquiries to/i,
        ],
        exclude: [],
    },
    // ── SmartLinx (LexisNexis) ────────────────────────────────
    {
        name: "smartlinx",
        primary: [
            /SmartLinx/i,
            /LexisNexis Risk Management/i,
        ],
        supporting: [
            /Person Report/i,
            /Search Terms:\s*SSN/i,
            /Report created for/i,
            /SSN Summary/i,
            /Address Summary/i,
            /Criminal Filings/i,
            /Bankruptcy Filings/i,
            /Judgment\s*[&/]\s*Lien Filings/i,
            /Person Summary/i,
            /LexID/i,
            /At a Glance/i,
        ],
        exclude: [],
    },
];
391
/**
 * Format names belonging to each package scope. `classifyDocument` uses this
 * to restrict the candidate profiles when a `scope` is supplied.
 */
const PACKAGE_FORMATS = {
    "credit-reports": ["credit-report"],
    "background-checks": ["smartlinx"],
    "appraisals": ["richer-values", "form-1004mc", "form-1073"],
    "bank-statements": [
        "wells-fargo",
        "td-bank",
        "chase",
        "bank-of-america",
        "navy-federal",
        "third-federal",
        "citibank",
        "relay",
        "grove-bank",
        "capital-one",
        "truist",
        "pnc",
        "discover",
        "synovus",
    ],
};
// ── Classifier ────────────────────────────────────────────────
/** How many of the lowest-numbered pages are inspected before giving up. */
const MAX_SCAN_PAGES = 5;
/**
 * Decide which parser family a document belongs to and where its real content
 * begins.
 *
 * Pages are visited in ascending page-number order (only page numbers that
 * actually occur in `lines`, at most `MAX_SCAN_PAGES` of them). For each page
 * the `fullText` of its lines is joined with newlines and every candidate
 * profile is scored as `primaryHits * 10 + supportingHits`. Profiles with an
 * `exclude` match or without any primary hit are ignored. The first page that
 * yields a candidate ends the scan; on equal scores the profile listed first
 * in `profiles` wins.
 *
 * @param lines — extracted text lines; only `page` and `fullText` are read.
 * @param scope — optional package name limiting the candidate formats
 *   (e.g. `"bank-statements"`, `"credit-reports"`). A name that is not a key
 *   of `PACKAGE_FORMATS` leaves no candidates, so the result is `null`.
 * @returns `{ format, startPage, skip, confidence }` for the first matching
 *   page, where `skip` is `startPage - 1` and `confidence` is the winning
 *   score, or `null` when no scanned page matches any profile.
 */
export function classifyDocument(lines, scope) {
    const allowedFormats = scope ? new Set(PACKAGE_FORMATS[scope]) : null;
    const pageNumbers = Array.from(new Set(lines.map((line) => line.page)));
    pageNumbers.sort((left, right) => left - right);
    for (const pageNumber of pageNumbers.slice(0, MAX_SCAN_PAGES)) {
        // Collect this page's text in the order the lines were supplied.
        const texts = [];
        for (const line of lines) {
            if (line.page === pageNumber) {
                texts.push(line.fullText);
            }
        }
        const pageText = texts.join("\n");
        const countHits = (patterns) => patterns.reduce((hits, rx) => (rx.test(pageText) ? hits + 1 : hits), 0);
        let winner = null;
        let winnerScore = 0;
        for (const candidate of profiles) {
            const inScope = allowedFormats === null || allowedFormats.has(candidate.name);
            if (!inScope) {
                continue;
            }
            if (countHits(candidate.exclude) > 0) {
                continue;
            }
            const primaryHits = countHits(candidate.primary);
            if (primaryHits === 0) {
                continue;
            }
            const score = primaryHits * 10 + countHits(candidate.supporting);
            // Strictly greater: an equal score never displaces an earlier profile.
            if (winner === null || score > winnerScore) {
                winner = candidate;
                winnerScore = score;
            }
        }
        if (winner !== null) {
            return {
                format: winner.name,
                startPage: pageNumber,
                skip: pageNumber - 1,
                confidence: winnerScore,
            };
        }
    }
    return null;
}
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Base error class for all parser errors.
3
+ * Consumers can catch `ParserError` to handle any parser failure.
4
+ */
5
+ export declare class ParserError extends Error {
6
+ constructor(message: string, options?: ErrorOptions);
7
+ }
8
+ /**
9
+ * Thrown when the input buffer is empty, too small, not a valid PDF, or password-protected/encrypted.
10
+ */
11
+ export declare class InvalidPDFError extends ParserError {
12
+ constructor(detail: string, options?: ErrorOptions);
13
+ }
14
+ /**
15
+ * Thrown when the PDF was read successfully but does not match the expected
16
+ * report format for the parser being used (e.g., passing a CreditXpert PDF
17
+ * to the Xactus parser).
18
+ */
19
+ export declare class UnrecognizedFormatError extends ParserError {
20
+ /** The parser that rejected the document */
21
+ readonly parser: string;
22
+ constructor(parser: string, detail: string, options?: ErrorOptions);
23
+ }
24
+ /**
25
+ * Thrown when the PDF matches the expected format but a required section
26
+ * is missing or could not be parsed (e.g., no borrower info found).
27
+ */
28
+ export declare class MissingSectionError extends ParserError {
29
+ /** The parser that encountered the issue */
30
+ readonly parser: string;
31
+ /** The section that was expected but not found */
32
+ readonly section: string;
33
+ constructor(parser: string, section: string, options?: ErrorOptions);
34
+ }
35
+ /**
36
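+ * Thrown when PDF text extraction fails (corrupt PDF, unreadable page) or the PDF contains no extractable text (e.g. a scanned image). Encrypted PDFs are reported as `InvalidPDFError`.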
37
+ */
38
+ export declare class ExtractionError extends ParserError {
39
+ constructor(detail: string, options?: ErrorOptions);
40
+ }
41
+ //# sourceMappingURL=errors.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"errors.d.ts","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,qBAAa,WAAY,SAAQ,KAAK;gBACxB,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY;CAIpD;AAED;;GAEG;AACH,qBAAa,eAAgB,SAAQ,WAAW;gBAClC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY;CAInD;AAED;;;;GAIG;AACH,qBAAa,uBAAwB,SAAQ,WAAW;IACtD,4CAA4C;IAC5C,SAAgB,MAAM,EAAE,MAAM,CAAC;gBAEnB,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY;CAKnE;AAED;;;GAGG;AACH,qBAAa,mBAAoB,SAAQ,WAAW;IAClD,4CAA4C;IAC5C,SAAgB,MAAM,EAAE,MAAM,CAAC;IAC/B,kDAAkD;IAClD,SAAgB,OAAO,EAAE,MAAM,CAAC;gBAEpB,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY;CAMpE;AAED;;GAEG;AACH,qBAAa,eAAgB,SAAQ,WAAW;gBAClC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY;CAInD"}
package/dist/errors.js ADDED
@@ -0,0 +1,58 @@
1
/**
 * Root of the parser error hierarchy.
 *
 * Every error class in this module extends `ParserError`, so a single
 * `instanceof ParserError` check (or `catch`) covers all of them.
 *
 * @param message — human-readable description, passed to `Error` unchanged.
 * @param options — optional `Error` options (e.g. `{ cause }`), forwarded as-is.
 */
export class ParserError extends Error {
    constructor(message, options) {
        super(message, options);
        // Own `name` so logs and `toString()` show the concrete class.
        this.name = "ParserError";
    }
}
11
/**
 * Signals that the input cannot be treated as a usable PDF: the buffer is
 * empty, too small, lacks the `%PDF-` header, or the document is
 * password-protected/encrypted.
 *
 * The resulting message is `Invalid PDF: <detail>`.
 *
 * @param detail — what exactly is wrong with the input.
 * @param options — optional `Error` options (e.g. `{ cause }`).
 */
export class InvalidPDFError extends ParserError {
    constructor(detail, options) {
        const message = `Invalid PDF: ${detail}`;
        super(message, options);
        this.name = "InvalidPDFError";
    }
}
20
/**
 * Signals that a PDF was read successfully but is not in the report format
 * the chosen parser expects (e.g., a CreditXpert PDF handed to the Xactus
 * parser).
 *
 * The resulting message is
 * `Unrecognized format for <parser> parser: <detail>`.
 *
 * @param parser — name of the parser that rejected the document.
 * @param detail — why the document was rejected.
 * @param options — optional `Error` options (e.g. `{ cause }`).
 */
export class UnrecognizedFormatError extends ParserError {
    /** Name of the parser that rejected the document. */
    parser;
    constructor(parser, detail, options) {
        super(`Unrecognized format for ${parser} parser: ${detail}`, options);
        this.parser = parser;
        this.name = "UnrecognizedFormatError";
    }
}
34
/**
 * Signals that a PDF matched the expected format but a required section is
 * absent or could not be parsed (e.g., no borrower info found).
 *
 * The resulting message is
 * `<parser>: required section "<section>" not found or empty`.
 *
 * @param parser — name of the parser that hit the problem.
 * @param section — name of the section that was expected.
 * @param options — optional `Error` options (e.g. `{ cause }`).
 */
export class MissingSectionError extends ParserError {
    /** Name of the parser that hit the problem. */
    parser;
    /** Name of the section that was expected but not found. */
    section;
    constructor(parser, section, options) {
        super(`${parser}: required section "${section}" not found or empty`, options);
        this.parser = parser;
        this.section = section;
        this.name = "MissingSectionError";
    }
}
50
/**
 * Signals that text could not be extracted from a PDF: the document failed to
 * load, a page could not be read, or no text was found at all (typically a
 * scanned image that needs OCR). Password-protected/encrypted documents are
 * reported as `InvalidPDFError` instead.
 *
 * The resulting message is `PDF extraction failed: <detail>`.
 *
 * @param detail — what went wrong during extraction.
 * @param options — optional `Error` options (e.g. `{ cause }`).
 */
export class ExtractionError extends ParserError {
    constructor(detail, options) {
        const message = `PDF extraction failed: ${detail}`;
        super(message, options);
        this.name = "ExtractionError";
    }
}
@@ -0,0 +1,22 @@
1
+ import type { TextItem, TextLine } from "./types.js";
2
+ export declare function extractTextItems(buffer: Buffer): Promise<TextItem[]>;
3
+ export declare function formLines(items: TextItem[]): TextLine[];
4
+ export declare function extractLines(buffer: Buffer): Promise<TextLine[]>;
5
+ /** A small filled rectangle found in the PDF graphics layer. */
6
+ export interface FilledRect {
7
+ x: number;
8
+ y: number;
9
+ width: number;
10
+ height: number;
11
+ page: number;
12
+ }
13
+ /**
14
+ * Extract small filled rectangles from specific PDF pages.
15
+ * Useful for detecting checked checkboxes in flattened PDF forms where
16
+ * checkbox state is rendered as filled squares rather than form annotations.
17
+ */
18
+ export declare function extractFilledRects(buffer: Buffer, pages: number[], opts?: {
19
+ minSize?: number;
20
+ maxSize?: number;
21
+ }): Promise<FilledRect[]>;
22
+ //# sourceMappingURL=extract.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract.d.ts","sourceRoot":"","sources":["../src/extract.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,QAAQ,EAAE,QAAQ,EAAe,MAAM,YAAY,CAAC;AAWlE,wBAAsB,gBAAgB,CACpC,MAAM,EAAE,MAAM,GACb,OAAO,CAAC,QAAQ,EAAE,CAAC,CAqErB;AAED,wBAAgB,SAAS,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE,CAgCvD;AA+BD,wBAAgB,YAAY,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CAEhE;AAED,gEAAgE;AAChE,MAAM,WAAW,UAAU;IACzB,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,CAAC;CACd;AAED;;;;GAIG;AACH,wBAAsB,kBAAkB,CACtC,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,MAAM,EAAE,EACf,IAAI,CAAC,EAAE;IAAE,OAAO,CAAC,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,MAAM,CAAA;CAAE,GAC5C,OAAO,CAAC,UAAU,EAAE,CAAC,CAkFvB"}
@@ -0,0 +1,219 @@
1
+ import { getDocument, OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
2
+ import { createRequire } from "module";
3
+ import { dirname, join } from "path";
4
+ import { InvalidPDFError, ExtractionError } from "./errors.js";
5
// ESM has no built-in `require`; create one so the installed location of
// pdfjs-dist can be resolved on disk.
const require = createRequire(import.meta.url);
// Directory of the installed pdfjs-dist package (where its package.json lives).
const pdfjsDir = dirname(require.resolve("pdfjs-dist/package.json"));
// Asset locations handed to pdf.js when opening a document; both end with "/".
const standardFontDataUrl = `${join(pdfjsDir, "standard_fonts")}/`;
const cMapUrl = `${join(pdfjsDir, "cmaps")}/`;
// Items on the same page whose y differs by at most this much are put on one line.
const LINE_Y_TOLERANCE = 3;
// A horizontal gap larger than this between neighbouring items starts a new segment.
const COLUMN_GAP_THRESHOLD = 15;
11
/**
 * Read every non-blank text run from a PDF, page by page.
 *
 * Each returned item carries the run's text, its position (`x`, and `y`
 * measured from the top of the page: viewport height minus the PDF baseline
 * coordinate), its `width`, a `height` taken from the absolute vertical scale
 * of the text transform, the font name and the 1-based page number. Numeric
 * fields are rounded to two decimals.
 *
 * The pdf.js document is always destroyed before this function settles, so
 * repeated calls do not accumulate loaded documents.
 *
 * @param buffer — raw PDF bytes.
 * @returns the text items of all pages, in page order.
 * @throws {InvalidPDFError} if the buffer is empty, shorter than 10 bytes,
 *   does not start with `%PDF-`, or the document is password-protected or
 *   encrypted.
 * @throws {ExtractionError} if pdf.js cannot load the document, a page or its
 *   text content cannot be read, or the document contains no text at all.
 */
export async function extractTextItems(buffer) {
    if (!buffer || buffer.length === 0) {
        throw new InvalidPDFError("input buffer is empty");
    }
    if (buffer.length < 10 || buffer.subarray(0, 5).toString() !== "%PDF-") {
        throw new InvalidPDFError("input does not start with a PDF header (%PDF-). Received " +
            `${buffer.length} bytes starting with "${buffer.subarray(0, 20).toString().replace(/[^\x20-\x7E]/g, "?")}"`);
    }
    let doc;
    try {
        doc = await getDocument({
            // Copy the bytes so the caller's Buffer is never handed to pdf.js directly.
            data: new Uint8Array(buffer),
            standardFontDataUrl,
            cMapUrl,
            cMapPacked: true,
        }).promise;
    }
    catch (err) {
        const msg = err?.message || String(err);
        // pdf.js signals protected documents with a PasswordException; the
        // message checks are kept as a fallback.
        if (err?.name === "PasswordException" || msg.includes("password") || msg.includes("encrypted")) {
            throw new InvalidPDFError("PDF is password-protected or encrypted", { cause: err });
        }
        throw new ExtractionError(msg, { cause: err });
    }
    const items = [];
    try {
        for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
            let page;
            try {
                page = await doc.getPage(pageNum);
            }
            catch (err) {
                throw new ExtractionError(`failed to read page ${pageNum}: ${err?.message || err}`, { cause: err });
            }
            const viewport = page.getViewport({ scale: 1 });
            let textContent;
            try {
                textContent = await page.getTextContent();
            }
            catch (err) {
                // Same treatment as a failing getPage(): surface a ParserError subclass.
                throw new ExtractionError(`failed to read text of page ${pageNum}: ${err?.message || err}`, { cause: err });
            }
            for (const item of textContent.items) {
                const ti = item;
                // Skip entries without text and whitespace-only runs.
                if (!ti.str || ti.str.trim() === "")
                    continue;
                const tx = ti.transform;
                const x = tx[4];
                const yFromBottom = tx[5];
                // PDF y grows upwards; flip it so y grows down the page.
                const y = viewport.height - yFromBottom;
                const fontSize = Math.abs(tx[3]);
                items.push({
                    text: ti.str,
                    x: Math.round(x * 100) / 100,
                    y: Math.round(y * 100) / 100,
                    width: Math.round((ti.width ?? 0) * 100) / 100,
                    height: Math.round(fontSize * 100) / 100,
                    fontName: ti.fontName ?? "",
                    page: pageNum,
                });
            }
        }
    }
    finally {
        // Release the document whether extraction succeeded or threw.
        await doc.destroy();
    }
    if (items.length === 0) {
        throw new ExtractionError("PDF has no extractable text. It may be a scanned image — OCR is required.");
    }
    return items;
}
71
/**
 * Group text items into lines.
 *
 * Items are ordered by page, then top-to-bottom, then left-to-right, where two
 * items on the same page whose `y` differs by at most `LINE_Y_TOLERANCE` count
 * as being at the same height. A new line starts whenever the page changes or
 * an item's `y` is more than `LINE_Y_TOLERANCE` away from the `y` of the item
 * that opened the current line. Each group is turned into a line by
 * `buildLine`, using the opening item's `y` and page.
 *
 * NOTE(review): the tolerance-based comparator is not a strict ordering (a
 * chain of items each within tolerance of the next can still span more than
 * the tolerance), so the order of such items depends on the sort
 * implementation. Behaviour is intentionally left unchanged here.
 *
 * @param items — text items; the input array is not modified.
 * @returns the lines in the order described above; `[]` for no items.
 */
export function formLines(items) {
    const ordered = Array.from(items);
    ordered.sort((left, right) => {
        if (left.page !== right.page) {
            return left.page - right.page;
        }
        const yDelta = left.y - right.y;
        return Math.abs(yDelta) > LINE_Y_TOLERANCE ? yDelta : left.x - right.x;
    });
    const result = [];
    let bucket = [];
    let anchorY = -Infinity;
    let anchorPage = -1;
    // Emit the line collected so far, if any.
    const flush = () => {
        if (bucket.length > 0) {
            result.push(buildLine(bucket, anchorY, anchorPage));
        }
    };
    for (const entry of ordered) {
        const startsNewLine = entry.page !== anchorPage ||
            Math.abs(entry.y - anchorY) > LINE_Y_TOLERANCE;
        if (!startsNewLine) {
            bucket.push(entry);
            continue;
        }
        flush();
        bucket = [entry];
        anchorY = entry.y;
        anchorPage = entry.page;
    }
    flush();
    return result;
}
102
/**
 * Assemble one line from the items that share it.
 *
 * Items are processed left to right. An item whose left edge is more than
 * `COLUMN_GAP_THRESHOLD` beyond the right edge of the segment being built
 * opens a new segment; otherwise it is appended to that segment, with a single
 * space inserted when the gap exceeds 1 and the segment text does not already
 * end in a space. A merged segment's width runs from its own left edge to the
 * right edge of the item just appended, and its height is the tallest item.
 *
 * @param items — items of one line; the input array is not modified.
 * @param y — vertical position recorded on the line.
 * @param page — page number recorded on the line.
 * @returns `{ segments, y, page, fullText }`, where `fullText` is the segment
 *   texts joined by single spaces.
 */
function buildLine(items, y, page) {
    const leftToRight = Array.from(items).sort((left, right) => left.x - right.x);
    const segments = [];
    for (const entry of leftToRight) {
        const open = segments.length > 0 ? segments[segments.length - 1] : null;
        const gap = open === null ? Infinity : entry.x - (open.x + open.width);
        if (open === null || gap > COLUMN_GAP_THRESHOLD) {
            // First item, or far enough away to count as a separate column.
            segments.push({ text: entry.text, x: entry.x, width: entry.width, height: entry.height });
            continue;
        }
        const separator = gap > 1 && !open.text.endsWith(" ") ? " " : "";
        open.text += separator + entry.text;
        open.width = entry.x + entry.width - open.x;
        open.height = Math.max(open.height, entry.height);
    }
    const fullText = segments.map((segment) => segment.text).join(" ");
    return { segments, y, page, fullText };
}
129
/**
 * Convenience wrapper: extract the text items of a PDF with
 * `extractTextItems` and group them into lines with `formLines`.
 *
 * @param buffer — raw PDF bytes.
 * @returns a promise for the document's lines; it rejects with whatever
 *   `extractTextItems` rejects with.
 */
export async function extractLines(buffer) {
    const items = await extractTextItems(buffer);
    return formLines(items);
}
132
/**
 * Extract small rectangles from specific PDF pages.
 * Useful for detecting checked checkboxes in flattened PDF forms where
 * checkbox state is rendered as filled squares rather than form annotations.
 *
 * The page's operator list is walked while tracking the current transform
 * through save/restore/transform operators. Every rectangle inside a
 * `constructPath` operator whose untransformed |width| and |height| both lie
 * within `[minSize, maxSize]` is reported. Its origin is mapped through the
 * current transform and `y` is flipped to be measured from the top of the
 * page; width and height are reported as absolute, untransformed values. All
 * numbers are rounded to two decimals, and rectangles on the same page whose
 * rounded-to-integer position coincides are reported once.
 *
 * NOTE(review): the paint operator following the path is not inspected, so
 * stroked-only rectangles of a matching size are reported as well.
 *
 * The pdf.js document is destroyed even when reading a page fails.
 *
 * @param buffer — raw PDF bytes.
 * @param pages — 1-based page numbers to inspect; numbers outside the
 *   document are ignored.
 * @param opts — optional size window: `minSize` (default 3) and `maxSize`
 *   (default 15).
 * @returns the de-duplicated rectangles in discovery order.
 */
export async function extractFilledRects(buffer, pages, opts) {
    const minSize = opts?.minSize ?? 3;
    const maxSize = opts?.maxSize ?? 15;
    const doc = await getDocument({
        data: new Uint8Array(buffer),
        standardFontDataUrl,
        cMapUrl,
        cMapPacked: true,
    }).promise;
    const results = [];
    try {
        for (const pageNum of pages) {
            if (pageNum < 1 || pageNum > doc.numPages)
                continue;
            const page = await doc.getPage(pageNum);
            const viewport = page.getViewport({ scale: 1 });
            const opList = await page.getOperatorList();
            // Current transformation matrix as [a, b, c, d, e, f]; starts as identity.
            let currentTransform = [1, 0, 0, 1, 0, 0];
            const transformStack = [];
            for (let i = 0; i < opList.fnArray.length; i++) {
                const op = opList.fnArray[i];
                const args = opList.argsArray[i];
                if (op === OPS.save) {
                    transformStack.push([...currentTransform]);
                }
                else if (op === OPS.restore) {
                    // An unbalanced restore falls back to the identity matrix.
                    currentTransform = transformStack.pop() || [1, 0, 0, 1, 0, 0];
                }
                else if (op === OPS.transform) {
                    // Concatenate the new matrix onto the current one.
                    const [a, b, c, d, e, f] = args;
                    const [ca, cb, cc, cd, ce, cf] = currentTransform;
                    currentTransform = [
                        ca * a + cc * b, cb * a + cd * b,
                        ca * c + cc * d, cb * c + cd * d,
                        ca * e + cc * f + ce, cb * e + cd * f + cf,
                    ];
                }
                else if (op === OPS.constructPath) {
                    const subOps = args[0];
                    const subArgs = args[1];
                    // Sub-operators share one flat argument array; advance through
                    // it by each sub-operator's argument count.
                    let argIdx = 0;
                    for (const subOp of subOps) {
                        if (subOp === OPS.rectangle) {
                            const rx = subArgs[argIdx], ry = subArgs[argIdx + 1];
                            const rw = subArgs[argIdx + 2], rh = subArgs[argIdx + 3];
                            const absW = Math.abs(rw), absH = Math.abs(rh);
                            if (absW >= minSize && absW <= maxSize && absH >= minSize && absH <= maxSize) {
                                const tx = currentTransform[0] * rx + currentTransform[2] * ry + currentTransform[4];
                                const ty = currentTransform[1] * rx + currentTransform[3] * ry + currentTransform[5];
                                results.push({
                                    x: Math.round(tx * 100) / 100,
                                    y: Math.round((viewport.height - ty) * 100) / 100,
                                    width: Math.round(absW * 100) / 100,
                                    height: Math.round(absH * 100) / 100,
                                    page: pageNum,
                                });
                            }
                            argIdx += 4;
                        }
                        else if (subOp === OPS.moveTo || subOp === OPS.lineTo) {
                            argIdx += 2;
                        }
                        else if (subOp === OPS.curveTo) {
                            argIdx += 6;
                        }
                        else if (subOp === OPS.curveTo2 || subOp === OPS.curveTo3) {
                            argIdx += 4;
                        }
                        // closePath has no args
                    }
                }
            }
        }
    }
    finally {
        // Release the document whether the scan completed or threw.
        await doc.destroy();
    }
    // Deduplicate rects at the same position (checkboxes often have outline + fill)
    const seen = new Set();
    return results.filter((r) => {
        const key = `${r.page}:${r.x.toFixed(0)}:${r.y.toFixed(0)}`;
        if (seen.has(key))
            return false;
        seen.add(key);
        return true;
    });
}
@@ -0,0 +1,12 @@
1
+ export { extractTextItems, extractLines, formLines, extractFilledRects } from "./extract.js";
2
+ export type { FilledRect } from "./extract.js";
3
+ export { classifyDocument } from "./classify.js";
4
+ export type { FormatName, ClassifyResult, PackageName } from "./classify.js";
5
+ export type { TextItem, TextSegment, TextLine, DateString, DateRange } from "./types.js";
6
+ export { toBBox } from "./types.js";
7
+ export type { BoundingBox } from "./types.js";
8
+ export { resolvePathFromArgs } from "./cli.js";
9
+ export { ParserError, InvalidPDFError, UnrecognizedFormatError, MissingSectionError, ExtractionError, } from "./errors.js";
10
+ export { parseDate, parseDateRange, parseCurrency, parseNum, escapeRegex, cleanNumber, getSegmentNear, extractLabelValue, findLabelInText, isBulletLine, isNumberedEntry, parseBulletKeyValues, collectBulletItems, collectUntil, mapToColumns, findColumnHeaders, getSection, } from "./utils.js";
11
+ export type { Section } from "./utils.js";
12
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,YAAY,EAAE,SAAS,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;AAC7F,YAAY,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AACjD,YAAY,EAAE,UAAU,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AAC7E,YAAY,EAAE,QAAQ,EAAE,WAAW,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACzF,OAAO,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AACpC,YAAY,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAC9C,OAAO,EAAE,mBAAmB,EAAE,MAAM,UAAU,CAAC;AAC/C,OAAO,EACL,WAAW,EACX,eAAe,EACf,uBAAuB,EACvB,mBAAmB,EACnB,eAAe,GAChB,MAAM,aAAa,CAAC;AACrB,OAAO,EACL,SAAS,EACT,cAAc,EACd,aAAa,EACb,QAAQ,EACR,WAAW,EACX,WAAW,EACX,cAAc,EACd,iBAAiB,EACjB,eAAe,EACf,YAAY,EACZ,eAAe,EACf,oBAAoB,EACpB,kBAAkB,EAClB,YAAY,EACZ,YAAY,EACZ,iBAAiB,EACjB,UAAU,GACX,MAAM,YAAY,CAAC;AACpB,YAAY,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,6 @@
1
+ export { extractTextItems, extractLines, formLines, extractFilledRects } from "./extract.js";
2
+ export { classifyDocument } from "./classify.js";
3
+ export { toBBox } from "./types.js";
4
+ export { resolvePathFromArgs } from "./cli.js";
5
+ export { ParserError, InvalidPDFError, UnrecognizedFormatError, MissingSectionError, ExtractionError, } from "./errors.js";
6
+ export { parseDate, parseDateRange, parseCurrency, parseNum, escapeRegex, cleanNumber, getSegmentNear, extractLabelValue, findLabelInText, isBulletLine, isNumberedEntry, parseBulletKeyValues, collectBulletItems, collectUntil, mapToColumns, findColumnHeaders, getSection, } from "./utils.js";
@@ -0,0 +1,36 @@
1
+ export interface BoundingBox {
2
+ x: number;
3
+ y: number;
4
+ width: number;
5
+ height: number;
6
+ pageNumber: number;
7
+ }
8
+ export declare function toBBox(seg: TextSegment, line: TextLine): BoundingBox;
9
+ /** ISO partial date string (e.g. "2024-02-04", "2024-02", "2024"), the literal "present" for open-ended dates, or null */
10
+ export type DateString = string | null;
11
+ export interface DateRange {
12
+ from: DateString;
13
+ to: DateString;
14
+ }
15
+ export interface TextItem {
16
+ text: string;
17
+ x: number;
18
+ y: number;
19
+ width: number;
20
+ height: number;
21
+ fontName: string;
22
+ page: number;
23
+ }
24
+ export interface TextSegment {
25
+ text: string;
26
+ x: number;
27
+ width: number;
28
+ height: number;
29
+ }
30
+ export interface TextLine {
31
+ segments: TextSegment[];
32
+ y: number;
33
+ page: number;
34
+ fullText: string;
35
+ }
36
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,WAAW;IAC1B,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,wBAAgB,MAAM,CAAC,GAAG,EAAE,WAAW,EAAE,IAAI,EAAE,QAAQ,GAAG,WAAW,CAQpE;AAID,6EAA6E;AAC7E,MAAM,MAAM,UAAU,GAAG,MAAM,GAAG,IAAI,CAAC;AAEvC,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,UAAU,CAAC;IACjB,EAAE,EAAE,UAAU,CAAC;CAChB;AAID,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,QAAQ;IACvB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,CAAC,EAAE,MAAM,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;CAClB"}
package/dist/types.js ADDED
@@ -0,0 +1,10 @@
1
+ // ── Bounding box ─────────────────────────────────────────────────────────────
2
/**
 * Build a bounding box for one segment of a text line.
 *
 * The horizontal position, width and height are taken from the segment;
 * the vertical position and the page number are taken from the line.
 *
 * @param seg  Segment supplying `x`, `width` and `height`.
 * @param line Line supplying `y` and `page`.
 * @returns An object with `x`, `y`, `width`, `height` and `pageNumber`.
 */
export function toBBox(seg, line) {
  const { x, width, height } = seg;
  const { y, page: pageNumber } = line;
  return { x, y, width, height, pageNumber };
}
@@ -0,0 +1,65 @@
1
+ import type { TextLine } from "./types.js";
2
+ import type { DateString, DateRange } from "./types.js";
3
+ export { type DateString, type DateRange } from "./types.js";
4
+ /**
5
+ * Parse various date formats into ISO 8601 partial date strings.
6
+ * "02/04/2024" → "2024-02-04"
7
+ * "01/1989" → "1989-01"
8
+ * "2007" → "2007"
9
+ * "Current" → "present"
10
+ * "" → null
11
+ */
12
+ export declare function parseDate(raw: string): DateString;
13
+ /**
14
+ * Parse a date range string like "02/2010 - 02/2026" or "2021 - Current"
15
+ */
16
+ export declare function parseDateRange(raw: string): DateRange;
17
+ /**
18
+ * Parse a currency string into a number rounded to two decimal places (cents).
19
+ * "$1,200,000" → 1200000
20
+ * "$5,250.00" → 5250
21
+ * "" → null
22
+ */
23
+ export declare function parseCurrency(raw: string): number | null;
24
+ /**
25
+ * Parse a numeric string to number or null.
26
+ */
27
+ export declare function parseNum(raw: string): number | null;
28
+ export declare function escapeRegex(s: string): string;
29
+ export declare function cleanNumber(s: string): string;
30
+ export interface Section {
31
+ name: string;
32
+ lines: TextLine[];
33
+ startIndex: number;
34
+ }
35
+ export declare function getSection(sections: Section[], name: string): Section | null;
36
+ /** Get text from segment at approximate column position */
37
+ export declare function getSegmentNear(line: TextLine, x: number, tolerance?: number): string;
38
+ /** Extract value after a label like "LexID 0065-8125-1321" */
39
+ export declare function extractLabelValue(line: TextLine, label: string): string | null;
40
+ /** Find a label in the full text and return its value */
41
+ export declare function findLabelInText(text: string, label: string): string;
42
+ /** Check if line starts with a number followed by a period */
43
+ export declare function isNumberedEntry(line: TextLine): number | null;
44
+ /** Check if a line starts with a bullet */
45
+ export declare function isBulletLine(line: TextLine): string | null;
46
+ /** Parse bullet lines into key-value pairs where format is "Key: Value" */
47
+ export declare function parseBulletKeyValues(lines: TextLine[]): Record<string, string>;
48
+ /** Collect all bullet-point items from consecutive lines */
49
+ export declare function collectBulletItems(lines: TextLine[], startIdx: number): {
50
+ items: string[];
51
+ nextIdx: number;
52
+ };
53
+ /** Collect lines starting at startIdx until stopCondition returns true (the stopping line is not collected) */
54
+ export declare function collectUntil(lines: TextLine[], startIdx: number, stopCondition: (line: TextLine, idx: number) => boolean): {
55
+ collected: TextLine[];
56
+ nextIdx: number;
57
+ };
58
+ /** Parse a line's segments into columns based on header x-positions */
59
+ export declare function mapToColumns(line: TextLine, columnXPositions: number[], tolerance?: number): string[];
60
+ /** Find a header row and extract column x-positions */
61
+ export declare function findColumnHeaders(lines: TextLine[], expectedHeaders: string[]): {
62
+ headerIndex: number;
63
+ columnXPositions: number[];
64
+ } | null;
65
+ //# sourceMappingURL=utils.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAC3C,OAAO,KAAK,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAExD,OAAO,EAAE,KAAK,UAAU,EAAE,KAAK,SAAS,EAAE,MAAM,YAAY,CAAC;AAI7D;;;;;;;GAOG;AACH,wBAAgB,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,CA2BjD;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,SAAS,CAWrD;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAMxD;AAED;;GAEG;AACH,wBAAgB,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAInD;AAID,wBAAgB,WAAW,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,CAE7C;AAED,wBAAgB,WAAW,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,CAE7C;AAID,MAAM,WAAW,OAAO;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,QAAQ,EAAE,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,wBAAgB,UAAU,CAAC,QAAQ,EAAE,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,GAAG,IAAI,CAU5E;AAID,2DAA2D;AAC3D,wBAAgB,cAAc,CAAC,IAAI,EAAE,QAAQ,EAAE,CAAC,EAAE,MAAM,EAAE,SAAS,SAAK,GAAG,MAAM,CAOhF;AAED,+DAA+D;AAC/D,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,QAAQ,EACd,KAAK,EAAE,MAAM,GACZ,MAAM,GAAG,IAAI,CAef;AAED,yDAAyD;AACzD,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,MAAM,CAUnE;AAED,8DAA8D;AAC9D,wBAAgB,eAAe,CAAC,IAAI,EAAE,QAAQ,GAAG,MAAM,GAAG,IAAI,CAG7D;AAED,2CAA2C;AAC3C,wBAAgB,YAAY,CAAC,IAAI,EAAE,QAAQ,GAAG,MAAM,GAAG,IAAI,CAM1D;AAED,2EAA2E;AAC3E,wBAAgB,oBAAoB,CAClC,KAAK,EAAE,QAAQ,EAAE,GAChB,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAcxB;AAED,4DAA4D;AAC5D,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,MAAM,GACf;IAAE,KAAK,EAAE,MAAM,EAAE,CAAC;IAAC,OAAO,EAAE,MAAM,CAAA;CAAE,CAatC;AAED,oEAAoE;AACpE,wBAAgB,YAAY,CAC1B,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,MAAM,EAChB,aAAa,EAAE,CAAC,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,MAAM,KAAK,OAAO,GACtD;IAAE,SAAS,EAAE,QAAQ,EAAE,CAAC;IAAC,OAAO,EAAE,MAAM,CAAA;CAAE,CAS5C;AAED,uEAAuE;AACvE,wBAAgB,YAAY,CAC1B,IAAI,EAAE,QAAQ,EACd,gBAAgB,EAAE,MAAM,EAAE,EAC1B,SAAS,SAAK,GACb,MAAM,EAAE,CA6BV;AAED,uDAAuD;AACvD,wBAAgB,iBAAiB,CAC/B,KAAK,EAAE,QAAQ,EAAE,EACjB,eAAe,EAAE,MAAM,EAAE,GACxB;IAAE,WAAW,EAAE,MAAM,CAAC;IAAC,gBAAgB,EA
AE,MAAM,EAAE,CAAA;CAAE,GAAG,IAAI,CAmB5D"}
package/dist/utils.js ADDED
@@ -0,0 +1,247 @@
1
+ // ── Date & currency parsing ───────────────────────────────────────────────────
2
/**
 * Parse various US-style date formats into ISO 8601 partial date strings.
 *   "02/04/2024" → "2024-02-04"
 *   "8/5/2024"   → "2024-08-05"   (single-digit month/day are zero-padded)
 *   "01/1989"    → "1989-01"
 *   "2007"       → "2007"
 *   "Current"    → "present"      (also "Present", case-insensitive)
 *   ""           → null           (also the placeholders "--/----" and "--/--/----")
 *
 * When the whole string is not a date, the first M/D/YYYY found inside it is
 * used, and failing that the first two-digit MM/YYYY.
 *
 * @param raw Text to parse.
 * @returns The ISO partial date, the literal "present", or null when nothing matches.
 */
export function parseDate(raw) {
  if (!raw) return null;
  const s = raw.trim();
  if (!s || s === "--/----" || s === "--/--/----") return null;

  const lowered = s.toLowerCase();
  if (lowered === "current" || lowered === "present") return "present";

  const pad = (part) => part.padStart(2, "0");

  // Whole string is M/D/YYYY (one or two digits for month and day).
  const full = s.match(/^(\d{1,2})\/(\d{1,2})\/(\d{4})$/);
  if (full) return `${full[3]}-${pad(full[1])}-${pad(full[2])}`;

  // Whole string is M/YYYY.
  const monthYear = s.match(/^(\d{1,2})\/(\d{4})$/);
  if (monthYear) return `${monthYear[2]}-${pad(monthYear[1])}`;

  // Whole string is a four-digit year.
  const yearOnly = s.match(/^(\d{4})$/);
  if (yearOnly) return yearOnly[1];

  // Embedded M/D/YYYY. Accepting single digits here stops "8/31/2024" from
  // falling through to the MM/YYYY pattern below and coming back as "2024-31".
  const embedded = s.match(/(\d{1,2})\/(\d{1,2})\/(\d{4})/);
  if (embedded) return `${embedded[3]}-${pad(embedded[1])}-${pad(embedded[2])}`;

  // Embedded MM/YYYY. Kept at exactly two digits so fragments such as
  // "page 3/2024" are not mistaken for dates.
  const embeddedMY = s.match(/(\d{2})\/(\d{4})/);
  if (embeddedMY) return `${embeddedMY[2]}-${embeddedMY[1]}`;

  return null;
}
40
/**
 * Parse a date range string such as "02/2010 - 02/2026" or "2021 - Current".
 *
 * Typographic dashes (figure, en, em, horizontal bar, minus sign) are treated
 * like the ASCII hyphen, so "02/2010 – 02/2026" yields both ends instead of
 * silently dropping the end date.
 *
 * @param raw Text containing one or two dates separated by a dash.
 * @returns `{ from, to }`, each parsed by `parseDate`; `to` is null when no
 *          separator is present, and both are null for empty input.
 */
export function parseDateRange(raw) {
  if (!raw) return { from: null, to: null };

  // Normalise typographic dashes to "-" before splitting.
  const s = raw.trim().replace(/[\u2012-\u2015\u2212]/g, "-");
  const parts = s.split(/\s*-\s*/);

  if (parts.length >= 2) {
    return {
      from: parseDate(parts[0]),
      // Re-join the remainder so placeholders such as "--/----", which contain
      // hyphens themselves, reach parseDate intact.
      to: parseDate(parts.slice(1).join("-").trim()),
    };
  }
  return { from: parseDate(s), to: null };
}
56
/**
 * Parse a currency string into a number rounded to two decimal places.
 *   "$1,200,000"  → 1200000
 *   "$5,250.00"   → 5250
 *   "-$12.50"     → -12.5
 *   "($1,234.56)" → -1234.56   (accounting-style negative)
 *   "$45.10-"     → -45.1      (trailing-minus negative)
 *   ""            → null
 *
 * Dollar signs, thousands separators and whitespace are ignored.
 *
 * @param raw Text to parse.
 * @returns The amount, or null when the input is empty or not numeric.
 */
export function parseCurrency(raw) {
  if (!raw) return null;
  let cleaned = raw.replace(/[$,\s]/g, "");
  if (!cleaned) return null;

  // parseFloat cannot read either of these negative notations: it returns NaN
  // for "(12.00)" and silently ignores the sign in "12.00-".
  let negative = false;
  const parenthesised = cleaned.match(/^\((.*)\)$/);
  if (parenthesised) {
    negative = true;
    cleaned = parenthesised[1];
  } else if (/^[^-]+-$/.test(cleaned)) {
    negative = true;
    cleaned = cleaned.slice(0, -1);
  }

  const n = parseFloat(cleaned);
  if (isNaN(n)) return null;

  const rounded = Math.round(n * 100) / 100;
  // Skip the negation for zero so "(0.00)" does not become -0.
  return negative && rounded !== 0 ? -Math.abs(rounded) : rounded;
}
71
/**
 * Parse a numeric string into a number.
 *
 * Every character other than digits, "." and "-" is discarded before the
 * remainder is handed to `parseFloat`.
 *
 * @param raw Text to parse.
 * @returns The parsed number, or null for empty or non-numeric input.
 */
export function parseNum(raw) {
  if (!raw) return null;
  const numericChars = raw.replace(/[^0-9.-]/g, "");
  const value = parseFloat(numericChars);
  if (Number.isNaN(value)) return null;
  return value;
}
80
+ // ── String utilities ─────────────────────────────────────────────────────────
81
/**
 * Escape regular-expression metacharacters so the string can be embedded in
 * a `RegExp` pattern and match literally.
 *
 * @param s Text to escape.
 * @returns `s` with each metacharacter prefixed by a backslash.
 */
export function escapeRegex(s) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, "\\$&");
}
84
/**
 * Remove every character that is not a digit, "." or "-".
 *
 * @param s Text to clean.
 * @returns The remaining characters, in their original order.
 */
export function cleanNumber(s) {
  const nonNumeric = /[^0-9.-]/g;
  return s.replace(nonNumeric, "");
}
87
/**
 * Find the section with the given name.
 *
 * When several sections share the name, the one with the most lines is
 * returned; on a tie the earliest one wins.
 *
 * @param sections Candidate sections.
 * @param name     Exact section name to look for.
 * @returns The matching section, or null when none has that name.
 */
export function getSection(sections, name) {
  return sections
    .filter((section) => section.name === name)
    .reduce(
      (longest, candidate) =>
        longest === null || candidate.lines.length > longest.lines.length
          ? candidate
          : longest,
      null,
    );
}
98
+ // ── Line parsing utilities ────────────────────────────────────────────────────
99
/**
 * Get the trimmed text of the first segment whose `x` lies strictly within
 * `tolerance` of the requested column position.
 *
 * @param line      Line whose segments are searched in order.
 * @param x         Target x-position.
 * @param tolerance Maximum (exclusive) distance from `x`; defaults to 30.
 * @returns The segment text, or "" when no segment is close enough.
 */
export function getSegmentNear(line, x, tolerance = 30) {
  const hit = line.segments.find((seg) => Math.abs(seg.x - x) < tolerance);
  return hit ? hit.text.trim() : "";
}
108
+ /** Extract value after a label like "LexID 0065-8125-1321" */
109
+ export function extractLabelValue(line, label) {
110
+ for (const seg of line.segments) {
111
+ const text = seg.text.trim();
112
+ if (text.startsWith(label)) {
113
+ const val = text.slice(label.length).trim();
114
+ if (val)
115
+ return val;
116
+ }
117
+ }
118
+ // Check across segments: label in one, value in next
119
+ for (let i = 0; i < line.segments.length - 1; i++) {
120
+ if (line.segments[i].text.trim() === label.replace(/:?\s*$/, "").trim()) {
121
+ return line.segments[i + 1].text.trim();
122
+ }
123
+ }
124
+ return null;
125
+ }
126
/**
 * Find a label in free text and return the value that follows it.
 *
 * The label is matched literally and may be followed by an optional ":" or
 * "." and whitespace. The value is first taken up to the next run of two or
 * more whitespace characters (or the end of the text); if that attempt does
 * not match, everything up to the end of the line is taken instead.
 *
 * @param text  Text to search.
 * @param label Label to look for.
 * @returns The trimmed value, or "" when the label is not found.
 */
export function findLabelInText(text, label) {
  const stopAtGap = new RegExp(`${escapeRegex(label)}\\s*[:.]?\\s*(.+?)(?:\\s{2,}|$)`);
  const restOfLine = new RegExp(`${escapeRegex(label)}\\s*[:.]?\\s*(.+)`);

  for (const pattern of [stopAtGap, restOfLine]) {
    const found = text.match(pattern);
    if (found?.[1]) {
      return found[1].trim();
    }
  }
  return "";
}
139
+ /** Check if line starts with a number followed by a period */
140
+ export function isNumberedEntry(line) {
141
+ const m = line.fullText.trim().match(/^(\d+)\.\s/);
142
+ return m ? parseInt(m[1], 10) : null;
143
+ }
144
+ /** Check if a line starts with a bullet */
145
+ export function isBulletLine(line) {
146
+ const text = line.fullText.trim();
147
+ if (text.startsWith("•") || text.startsWith("·") || text.startsWith("- ")) {
148
+ return text.replace(/^[•·\-]\s*/, "").trim();
149
+ }
150
+ return null;
151
+ }
152
/**
 * Parse bullet lines of the form "Key: Value" into a key/value record.
 *
 * Lines that are not bullets, bullets with no content, and bullets with no
 * colon (or with a colon as their first character) are ignored. The split
 * happens at the first colon; a later line with the same key overwrites an
 * earlier one.
 *
 * @param lines Lines to scan.
 * @returns Record mapping each trimmed key to its trimmed value.
 */
export function parseBulletKeyValues(lines) {
  const pairs = {};
  for (const line of lines) {
    const content = isBulletLine(line);
    if (!content) continue;

    const separator = content.indexOf(":");
    if (separator <= 0) continue;

    const key = content.slice(0, separator).trim();
    pairs[key] = content.slice(separator + 1).trim();
  }
  return pairs;
}
168
/**
 * Collect the text of consecutive bullet lines starting at `startIdx`.
 *
 * Scanning stops at the first line that is not a bullet, or whose bullet has
 * no text, or at the end of `lines`.
 *
 * @param lines    Lines to scan.
 * @param startIdx Index of the first line to inspect.
 * @returns `items` (bullet texts in order) and `nextIdx` (index of the first
 *          line that was not consumed).
 */
export function collectBulletItems(lines, startIdx) {
  const items = [];
  let nextIdx = startIdx;
  for (; nextIdx < lines.length; nextIdx++) {
    const text = isBulletLine(lines[nextIdx]);
    if (!text) break;
    items.push(text);
  }
  return { items, nextIdx };
}
184
+ /** Collect lines until the next section header or numbered entry */
185
+ export function collectUntil(lines, startIdx, stopCondition) {
186
+ const collected = [];
187
+ let i = startIdx;
188
+ while (i < lines.length) {
189
+ if (stopCondition(lines[i], i))
190
+ break;
191
+ collected.push(lines[i]);
192
+ i++;
193
+ }
194
+ return { collected, nextIdx: i };
195
+ }
196
+ /** Parse a line's segments into columns based on header x-positions */
197
+ export function mapToColumns(line, columnXPositions, tolerance = 25) {
198
+ const result = new Array(columnXPositions.length).fill("");
199
+ for (const seg of line.segments) {
200
+ let bestCol = -1;
201
+ let bestDist = Infinity;
202
+ for (let c = 0; c < columnXPositions.length; c++) {
203
+ const dist = Math.abs(seg.x - columnXPositions[c]);
204
+ if (dist < tolerance && dist < bestDist) {
205
+ bestDist = dist;
206
+ bestCol = c;
207
+ }
208
+ }
209
+ if (bestCol >= 0) {
210
+ result[bestCol] = result[bestCol]
211
+ ? result[bestCol] + " " + seg.text.trim()
212
+ : seg.text.trim();
213
+ }
214
+ else {
215
+ // Try to assign to the closest column to the left
216
+ for (let c = columnXPositions.length - 1; c >= 0; c--) {
217
+ if (seg.x >= columnXPositions[c] - tolerance) {
218
+ result[c] = result[c]
219
+ ? result[c] + " " + seg.text.trim()
220
+ : seg.text.trim();
221
+ break;
222
+ }
223
+ }
224
+ }
225
+ }
226
+ return result;
227
+ }
228
+ /** Find a header row and extract column x-positions */
229
+ export function findColumnHeaders(lines, expectedHeaders) {
230
+ for (let i = 0; i < lines.length; i++) {
231
+ const line = lines[i];
232
+ const matchCount = expectedHeaders.filter((h) => line.fullText.includes(h)).length;
233
+ if (matchCount >= Math.ceil(expectedHeaders.length * 0.5)) {
234
+ const columnXPositions = expectedHeaders.map((h) => {
235
+ for (const seg of line.segments) {
236
+ if (seg.text.includes(h))
237
+ return seg.x;
238
+ }
239
+ return -1;
240
+ });
241
+ if (columnXPositions.every((x) => x >= 0)) {
242
+ return { headerIndex: i, columnXPositions };
243
+ }
244
+ }
245
+ }
246
+ return null;
247
+ }
package/package.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "name": "@parseo/shared",
3
+ "version": "1.0.0",
4
+ "type": "module",
5
+ "main": "./dist/index.js",
6
+ "types": "./dist/index.d.ts",
7
+ "exports": {
8
+ ".": {
9
+ "types": "./dist/index.d.ts",
10
+ "import": "./dist/index.js"
11
+ }
12
+ },
13
+ "license": "MIT",
14
+ "publishConfig": {
15
+ "access": "public"
16
+ },
17
+ "files": ["dist", "!dist/**/cli.*"],
18
+ "scripts": {
19
+ "build": "tsc"
20
+ },
21
+ "dependencies": {
22
+ "pdfjs-dist": "^4.9.155"
23
+ }
24
+ }