@parseo/appraisals 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -0
- package/dist/form-1004mc/extract-checkboxes.d.ts +43 -0
- package/dist/form-1004mc/extract-checkboxes.d.ts.map +1 -0
- package/dist/form-1004mc/extract-checkboxes.js +145 -0
- package/dist/form-1004mc/index.d.ts +3 -0
- package/dist/form-1004mc/index.d.ts.map +1 -0
- package/dist/form-1004mc/index.js +1 -0
- package/dist/form-1004mc/parse-page1.d.ts +8 -0
- package/dist/form-1004mc/parse-page1.d.ts.map +1 -0
- package/dist/form-1004mc/parse-page1.js +760 -0
- package/dist/form-1004mc/parse-sales.d.ts +6 -0
- package/dist/form-1004mc/parse-sales.d.ts.map +1 -0
- package/dist/form-1004mc/parse-sales.js +505 -0
- package/dist/form-1004mc/parser.d.ts +5 -0
- package/dist/form-1004mc/parser.d.ts.map +1 -0
- package/dist/form-1004mc/parser.js +437 -0
- package/dist/form-1004mc/types.d.ts +302 -0
- package/dist/form-1004mc/types.d.ts.map +1 -0
- package/dist/form-1004mc/types.js +1 -0
- package/dist/form-1073/index.d.ts +3 -0
- package/dist/form-1073/index.d.ts.map +1 -0
- package/dist/form-1073/index.js +1 -0
- package/dist/form-1073/parse-page1.d.ts +8 -0
- package/dist/form-1073/parse-page1.d.ts.map +1 -0
- package/dist/form-1073/parse-page1.js +704 -0
- package/dist/form-1073/parse-page2.d.ts +6 -0
- package/dist/form-1073/parse-page2.d.ts.map +1 -0
- package/dist/form-1073/parse-page2.js +438 -0
- package/dist/form-1073/parse-sales.d.ts +7 -0
- package/dist/form-1073/parse-sales.d.ts.map +1 -0
- package/dist/form-1073/parse-sales.js +477 -0
- package/dist/form-1073/parser.d.ts +5 -0
- package/dist/form-1073/parser.d.ts.map +1 -0
- package/dist/form-1073/parser.js +102 -0
- package/dist/form-1073/types.d.ts +300 -0
- package/dist/form-1073/types.d.ts.map +1 -0
- package/dist/form-1073/types.js +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +9 -0
- package/dist/richer-values/index.d.ts +3 -0
- package/dist/richer-values/index.d.ts.map +1 -0
- package/dist/richer-values/index.js +1 -0
- package/dist/richer-values/parser.d.ts +5 -0
- package/dist/richer-values/parser.d.ts.map +1 -0
- package/dist/richer-values/parser.js +1067 -0
- package/dist/richer-values/types.d.ts +225 -0
- package/dist/richer-values/types.d.ts.map +1 -0
- package/dist/richer-values/types.js +1 -0
- package/package.json +24 -0
package/README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# @parseo/appraisals
|
|
2
|
+
|
|
3
|
+
Deterministic PDF parsers for appraisal reports. Supports Richer Values, Form 1004-MC (URAR), and Form 1073 (Condo).
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @parseo/appraisals
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```typescript
|
|
14
|
+
import { richerValues, form1004MC, form1073 } from "@parseo/appraisals";
|
|
15
|
+
|
|
16
|
+
const rvReport = await richerValues(buffer); // RicherValuesReport
|
|
17
|
+
const mcReport = await form1004MC(buffer); // Form1004MCReport
|
|
18
|
+
const condoReport = await form1073(buffer); // Form1073Report
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
| Format | Import name |
|
|
22
|
+
|---|---|
|
|
23
|
+
| Richer Values | `richerValues` |
|
|
24
|
+
| Form 1004-MC (URAR) | `form1004MC` |
|
|
25
|
+
| Form 1073 (Condo) | `form1073` |
|
|
26
|
+
|
|
27
|
+
## Data conventions
|
|
28
|
+
|
|
29
|
+
- **Dates**: ISO 8601 strings (`"2024-08-31"`) or `null`
|
|
30
|
+
- **Currency**: Plain numbers (`54961.89`, not `"$54,961.89"`)
|
|
31
|
+
- **Bounding boxes**: `{ x, y, width, height, pageNumber }` on every extracted field
|
|
32
|
+
|
|
33
|
+
## License
|
|
34
|
+
|
|
35
|
+
MIT
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Checkbox extraction from PDF vector graphics.
|
|
3
|
+
*
|
|
4
|
+
* In flattened TOTAL-generated PDFs, checkbox marks are rendered as
|
|
5
|
+
* constructPath operations (small ~8.4×8.4 pt shapes). An unchecked
|
|
6
|
+
* checkbox produces 1 path (the empty square outline). A checked
|
|
7
|
+
* checkbox produces 5 paths (square outline + 4 X-mark line segments).
|
|
8
|
+
*
|
|
9
|
+
* We exploit this by counting how many path shapes fall at each
|
|
10
|
+
* position — ≥ 3 means checked.
|
|
11
|
+
*/
|
|
12
|
+
/** Location of one checkbox that extractCheckedBoxes classified as checked. */
export interface CheckedPosition {
|
|
13
|
+
/** x of the checkbox square (left edge); the implementation snaps it to the 2-pt grid it uses for grouping shapes */
|
|
14
|
+
x: number;
|
|
15
|
+
/** y in top-down text coordinates (matches TextLine.y); computed as page height minus the shape's top edge, then snapped to the same 2-pt grid */
|
|
16
|
+
y: number;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Extract all checked checkbox positions from a given PDF page.
|
|
20
|
+
*
* @param buffer - Raw bytes of the PDF document.
* @param pageNum - Page to inspect; the implementation passes it unchanged to
*   pdf.js `getPage` (presumably 1-based, as in pdf.js — confirm against callers).
|
|
21
|
+
* @returns Array of {x, y} positions where a checkbox is checked,
|
|
22
|
+
* in the same coordinate space as extracted TextLine objects.
|
|
23
|
+
*/
|
|
24
|
+
export declare function extractCheckedBoxes(buffer: Buffer, pageNum: number): Promise<CheckedPosition[]>;
|
|
25
|
+
/**
|
|
26
|
+
* Given a list of checked positions and a set of checkbox options
|
|
27
|
+
* at known x-positions on a given text-y row, return which option is checked.
|
|
28
|
+
*
|
|
29
|
+
* The checkbox square is rendered ~7pt above the text label (lower y value
|
|
30
|
+
* in top-down coordinates) and ~12pt to its left. Since rows are only
|
|
31
|
+
* ~11pt apart, we use a directional y-match: the checkbox must be above
|
|
32
|
+
* the text line (yDiff in [2, 12]) to avoid cross-row false positives.
|
|
33
|
+
*
|
|
34
|
+
* @param checked - Array of checked checkbox positions
|
|
35
|
+
* @param textY - The y-coordinate of the text row (from TextLine.y)
|
|
36
|
+
* @param options - Array of candidate options; each `x` is the x-position of the option's label text (the implementation looks for the box ~12pt to its left) and `label` is the value returned on a match
|
|
37
|
+
* @param xTolerance - x-axis matching tolerance in points (default 6)
* @returns The label of the first option, in array order, that has a matching checked box, or "" when none matches
|
|
38
|
+
*/
|
|
39
|
+
export declare function resolveCheckbox(checked: CheckedPosition[], textY: number, options: {
|
|
40
|
+
x: number;
|
|
41
|
+
label: string;
|
|
42
|
+
}[], xTolerance?: number): string;
|
|
43
|
+
//# sourceMappingURL=extract-checkboxes.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract-checkboxes.d.ts","sourceRoot":"","sources":["../../src/form-1004mc/extract-checkboxes.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAqBH,MAAM,WAAW,eAAe;IAC9B,2CAA2C;IAC3C,CAAC,EAAE,MAAM,CAAC;IACV,0DAA0D;IAC1D,CAAC,EAAE,MAAM,CAAC;CACX;AAED;;;;;GAKG;AACH,wBAAsB,mBAAmB,CACvC,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,eAAe,EAAE,CAAC,CAgG5B;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,eAAe,CAC7B,OAAO,EAAE,eAAe,EAAE,EAC1B,KAAK,EAAE,MAAM,EACb,OAAO,EAAE;IAAE,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,EAAE,EACvC,UAAU,SAAI,GACb,MAAM,CAWR"}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Checkbox extraction from PDF vector graphics.
|
|
3
|
+
*
|
|
4
|
+
* In flattened TOTAL-generated PDFs, checkbox marks are rendered as
|
|
5
|
+
* constructPath operations (small ~8.4×8.4 pt shapes). An unchecked
|
|
6
|
+
* checkbox produces 1 path (the empty square outline). A checked
|
|
7
|
+
* checkbox produces 5 paths (square outline + 4 X-mark line segments).
|
|
8
|
+
*
|
|
9
|
+
* We exploit this by counting how many path shapes fall at each
|
|
10
|
+
* position — ≥ 3 means checked.
|
|
11
|
+
*/
|
|
12
|
+
import { OPS, getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
13
|
+
/**
 * Compose two 2D affine transforms given in PDF six-element form
 * `[a, b, c, d, e, f]`, returning the matrix product `a · b`
 * (i.e. `b` is applied first, then `a`).
 */
function multiply(a, b) {
    // Linear (2×2) part of `a` applied to a column vector (x, y).
    const rotateScale = (x, y) => [a[0] * x + a[2] * y, a[1] * x + a[3] * y];
    const [m0, m1] = rotateScale(b[0], b[1]);
    const [m2, m3] = rotateScale(b[2], b[3]);
    const [tx, ty] = rotateScale(b[4], b[5]);
    // Only the translation column picks up `a`'s own offset.
    return [m0, m1, m2, m3, tx + a[4], ty + a[5]];
}
|
|
23
|
+
/**
 * Map the point (x, y) through the six-element affine matrix `m`
 * and return the transformed `[x, y]` pair.
 */
function applyM(m, x, y) {
    const outX = m[0] * x + m[2] * y + m[4];
    const outY = m[1] * x + m[3] * y + m[5];
    return [outX, outY];
}
|
|
26
|
+
/**
 * Extract all checked checkbox positions from a given PDF page.
 *
 * Walks the page's operator list while tracking the current transformation
 * matrix, takes the bounding box of every small (6–11 pt) path, buckets the
 * boxes on a 2-pt grid, and reports each bucket that holds three or more
 * shapes (square outline plus X-mark strokes) as a checked checkbox.
 *
 * @param buffer - Raw PDF bytes. They are copied before being handed to
 *   pdf.js, so the caller's buffer is never transferred or detached.
 * @param pageNum - Page number passed to pdf.js `getPage` (first page is 1).
 * @returns Array of {x, y} positions where a checkbox is checked,
 *   in the same coordinate space as extracted TextLine objects. Both
 *   values are snapped to the 2-pt grouping grid.
 */
export async function extractCheckedBoxes(buffer, pageNum) {
    // pdf.js may take ownership of the typed array it receives, so give it
    // a private copy instead of a view over the caller's memory.
    const data = new Uint8Array(buffer);
    const loadingTask = getDocument({ data, useSystemFonts: true });
    try {
        const pdf = await loadingTask.promise;
        const page = await pdf.getPage(pageNum);
        const pageHeight = page.getViewport({ scale: 1 }).height;
        const ops = await page.getOperatorList();
        // Track current transformation matrix with save/restore stack
        let ctm = [1, 0, 0, 1, 0, 0];
        const stack = [];
        // Collect all small (~8×8 pt) path shapes and their page coordinates
        const shapes = [];
        for (let i = 0; i < ops.fnArray.length; i++) {
            const fn = ops.fnArray[i];
            const args = ops.argsArray[i];
            if (fn === OPS.save) {
                stack.push([...ctm]);
                continue;
            }
            if (fn === OPS.restore) {
                // An unbalanced restore falls back to the identity matrix
                ctm = stack.pop() ?? [1, 0, 0, 1, 0, 0];
                continue;
            }
            if (fn === OPS.transform) {
                ctm = multiply(ctm, args);
                continue;
            }
            if (fn !== OPS.constructPath) {
                continue;
            }
            const [opcodes, pathArgs] = args;
            let cursor = 0;
            const points = [];
            for (const op of opcodes) {
                switch (op) {
                    case OPS.moveTo:
                    case OPS.lineTo:
                        points.push(applyM(ctm, pathArgs[cursor], pathArgs[cursor + 1]));
                        cursor += 2;
                        break;
                    case OPS.rectangle: {
                        const rx = pathArgs[cursor];
                        const ry = pathArgs[cursor + 1];
                        const rw = pathArgs[cursor + 2];
                        const rh = pathArgs[cursor + 3];
                        cursor += 4;
                        // Two opposite corners are enough for the bounding box
                        points.push(applyM(ctm, rx, ry));
                        points.push(applyM(ctm, rx + rw, ry + rh));
                        break;
                    }
                    // Curves add no points to the bounding box, but their
                    // coordinates must still be consumed; otherwise every
                    // later segment of this path reads the wrong arguments.
                    case OPS.curveTo:
                        cursor += 6;
                        break;
                    case OPS.curveTo2:
                    case OPS.curveTo3:
                        cursor += 4;
                        break;
                    default:
                        // closePath (and anything else) carries no coordinates
                        break;
                }
            }
            if (points.length < 2) {
                continue;
            }
            const xs = points.map((p) => p[0]);
            const ys = points.map((p) => p[1]);
            const minX = Math.min(...xs);
            const maxX = Math.max(...xs);
            const minY = Math.min(...ys);
            const maxY = Math.max(...ys);
            const w = maxX - minX;
            const h = maxY - minY;
            // Checkbox squares are ~8.4×8.4 pt; allow 6-11 range
            if (w >= 6 && w <= 11 && h >= 6 && h <= 11) {
                // Convert from PDF bottom-up to top-down text coordinates
                const textY = pageHeight - maxY;
                shapes.push({ x: Math.round(minX * 10) / 10, y: Math.round(textY * 10) / 10 });
            }
        }
        // Group shapes by position: snap to the nearest 2pt grid cell so that
        // the outline and the X-mark strokes of one box land in the same bucket
        const groups = new Map();
        for (const s of shapes) {
            const key = `${Math.round(s.x / 2) * 2},${Math.round(s.y / 2) * 2}`;
            groups.set(key, (groups.get(key) ?? 0) + 1);
        }
        // Checked checkboxes have >= 3 shapes at the same position
        const checked = [];
        for (const [key, count] of groups) {
            if (count >= 3) {
                const [x, y] = key.split(",").map(Number);
                checked.push({ x, y });
            }
        }
        return checked;
    }
    finally {
        // Release the document (and its worker resources) on success or failure
        await loadingTask.destroy();
    }
}
|
|
119
|
+
/**
 * Decide which of several labelled checkbox options on one text row is
 * checked.
 *
 * A checkbox square is drawn ~12pt to the left of its label text and a few
 * points above it (smaller y in top-down coordinates). Rows sit only ~11pt
 * apart, so the vertical test is one-sided: a box counts for this row only
 * when it lies between 2 and 12 points above the text line, which keeps
 * boxes from the neighbouring rows out.
 *
 * @param checked - Array of checked checkbox positions
 * @param textY - The y-coordinate of the text row (from TextLine.y)
 * @param options - Array of candidates; `x` is the x-position of the
 *   option's label text, `label` is the value returned when it matches
 * @param xTolerance - x-axis matching tolerance in points (default 6)
 * @returns Label of the first option (in array order) that has a matching
 *   checked box, or "" when no option matches
 */
export function resolveCheckbox(checked, textY, options, xTolerance = 6) {
    const winner = options.find((candidate) => {
        // Expected left edge of the square belonging to this label
        const boxX = candidate.x - 12.3;
        return checked.some((box) => {
            const rise = textY - box.y; // > 0 means the box is above the text
            const inRowBand = rise >= 2 && rise <= 12;
            return inRowBand && Math.abs(box.x - boxX) < xTolerance;
        });
    });
    return winner ? winner.label : "";
}
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
export { parseForm1004MC, parseForm1004MCFromLines } from "./parser.js";
|
|
2
|
+
export type { Form1004MCReport, SubjectSection, ContractSection, NeighborhoodSection, SiteSection, ImprovementsSection, ComparableSale, SalesComparisonSection, ReconciliationSection, CostApproachSection, MarketConditionsAddendum, Form1004MCHeader, InventoryAnalysis, MedianSaleListData, MarketAnalysisText, CondoCoopProjects, AppraiserInfo, TimePeriodRow, Trend, } from "./types.js";
|
|
3
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/form-1004mc/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,wBAAwB,EAAE,MAAM,aAAa,CAAC;AACxE,YAAY,EACV,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,mBAAmB,EACnB,WAAW,EACX,mBAAmB,EACnB,cAAc,EACd,sBAAsB,EACtB,qBAAqB,EACrB,mBAAmB,EACnB,wBAAwB,EACxB,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,EAClB,kBAAkB,EAClB,iBAAiB,EACjB,aAAa,EACb,aAAa,EACb,KAAK,GACN,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { parseForm1004MC, parseForm1004MCFromLines } from "./parser.js";
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { TextLine } from "@parseo/shared";
|
|
2
|
+
import type { SubjectSection, ContractSection, NeighborhoodSection, SiteSection, ImprovementsSection } from "./types.js";
|
|
3
|
+
/** Builds the `SubjectSection` of a Form 1004-MC report from extracted text lines. NOTE(review): the implementation (parse-page1.js) is not visible here — confirm which lines callers are expected to pass. */
export declare function parseSubjectSection(lines: TextLine[]): SubjectSection;
|
|
4
|
+
/** Builds the `ContractSection` of a Form 1004-MC report from extracted text lines. NOTE(review): the implementation (parse-page1.js) is not visible here — confirm which lines callers are expected to pass. */
export declare function parseContractSection(lines: TextLine[]): ContractSection;
|
|
5
|
+
/** Builds the `NeighborhoodSection` of a Form 1004-MC report from extracted text lines. NOTE(review): the implementation (parse-page1.js) is not visible here — confirm which lines callers are expected to pass. */
export declare function parseNeighborhoodSection(lines: TextLine[]): NeighborhoodSection;
|
|
6
|
+
/** Builds the `SiteSection` of a Form 1004-MC report from extracted text lines. NOTE(review): the implementation (parse-page1.js) is not visible here — confirm which lines callers are expected to pass. */
export declare function parseSiteSection(lines: TextLine[]): SiteSection;
|
|
7
|
+
/** Builds the `ImprovementsSection` of a Form 1004-MC report from extracted text lines. NOTE(review): the implementation (parse-page1.js) is not visible here — confirm which lines callers are expected to pass. */
export declare function parseImprovementsSection(lines: TextLine[]): ImprovementsSection;
|
|
8
|
+
//# sourceMappingURL=parse-page1.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parse-page1.d.ts","sourceRoot":"","sources":["../../src/form-1004mc/parse-page1.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAe,MAAM,gBAAgB,CAAC;AAC5D,OAAO,KAAK,EACV,cAAc,EACd,eAAe,EACf,mBAAmB,EACnB,WAAW,EACX,mBAAmB,EACpB,MAAM,YAAY,CAAC;AAiDpB,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,cAAc,CAuGrE;AAID,wBAAgB,oBAAoB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,eAAe,CA6CvE;AAID,wBAAgB,wBAAwB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,mBAAmB,CA0I/E;AAID,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,WAAW,CAkE/D;AAID,wBAAgB,wBAAwB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,mBAAmB,CAgN/E"}
|