@heripo/model 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.ko.md +284 -0
- package/README.md +284 -0
- package/dist/index.cjs +19 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +700 -0
- package/dist/index.d.ts +700 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -0
- package/package.json +73 -0
package/README.md
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
# @heripo/model
|
|
2
|
+
|
|
3
|
+
> Document models and type definitions
|
|
4
|
+
|
|
5
|
+
[npm package](https://www.npmjs.com/package/@heripo/model)
|
|
6
|
+
[Node.js](https://nodejs.org/)
|
|
7
|
+
[License](../../LICENSE)
|
|
8
|
+
|
|
9
|
+
**English** | [한국어](./README.ko.md)
|
|
10
|
+
|
|
11
|
+
> **Note**: Please check the [root README](../../README.md) first for project overview, installation instructions, and roadmap.
|
|
12
|
+
|
|
13
|
+
`@heripo/model` provides data models and TypeScript type definitions used in heripo engine.
|
|
14
|
+
|
|
15
|
+
## Table of Contents
|
|
16
|
+
|
|
17
|
+
- [Overview](#overview)
|
|
18
|
+
- [Installation](#installation)
|
|
19
|
+
- [Data Models](#data-models)
|
|
20
|
+
- [Usage](#usage)
|
|
21
|
+
- [License](#license)
|
|
22
|
+
|
|
23
|
+
## Overview
|
|
24
|
+
|
|
25
|
+
heripo engine's data processing pipeline:
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
DoclingDocument (Docling SDK raw output)
|
|
29
|
+
↓
|
|
30
|
+
ProcessedDocument (LLM-optimized intermediate model)
|
|
31
|
+
↓
|
|
32
|
+
(Various models to be added per roadmap)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
`@heripo/model` defines data models currently used in the PDF parsing and document structure extraction stages. Various domain-specific models for archaeological data analysis, standardization, semantic modeling, etc. will be added in the future.
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# Install with npm
|
|
41
|
+
npm install @heripo/model
|
|
42
|
+
|
|
43
|
+
# Install with pnpm
|
|
44
|
+
pnpm add @heripo/model
|
|
45
|
+
|
|
46
|
+
# Install with yarn
|
|
47
|
+
yarn add @heripo/model
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Data Models
|
|
51
|
+
|
|
52
|
+
### DoclingDocument
|
|
53
|
+
|
|
54
|
+
Raw output format from Docling SDK.
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
import type { DoclingDocument } from '@heripo/model';
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**Key Fields:**
|
|
61
|
+
|
|
62
|
+
- `type`: Document type (e.g., "pdf")
|
|
63
|
+
- `item_index`: Item index
|
|
64
|
+
- `json_content`: Document content (JSON object)
|
|
65
|
+
|
|
66
|
+
### ProcessedDocument
|
|
67
|
+
|
|
68
|
+
Intermediate data model optimized for LLM analysis.
|
|
69
|
+
|
|
70
|
+
```typescript
|
|
71
|
+
import type { ProcessedDocument } from '@heripo/model';
|
|
72
|
+
|
|
73
|
+
interface ProcessedDocument {
|
|
74
|
+
reportId: string; // Report ID
|
|
75
|
+
pageRangeMap: PageRange[]; // PDF page → document page mapping
|
|
76
|
+
chapters: Chapter[]; // Hierarchical chapter structure
|
|
77
|
+
images: ProcessedImage[]; // Extracted image metadata
|
|
78
|
+
tables: ProcessedTable[]; // Extracted table data
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Chapter
|
|
83
|
+
|
|
84
|
+
Hierarchical section structure of the document.
|
|
85
|
+
|
|
86
|
+
```typescript
|
|
87
|
+
import type { Chapter } from '@heripo/model';
|
|
88
|
+
|
|
89
|
+
interface Chapter {
|
|
90
|
+
id: string; // Chapter ID
|
|
91
|
+
title: string; // Chapter title
|
|
92
|
+
level: number; // Hierarchy level (1, 2, 3, ...)
|
|
93
|
+
pageNo?: number; // Start page number
|
|
94
|
+
textBlocks: TextBlock[]; // Text blocks
|
|
95
|
+
imageIds: string[]; // Image ID references
|
|
96
|
+
tableIds: string[]; // Table ID references
|
|
97
|
+
children: Chapter[]; // Sub-chapters
|
|
98
|
+
}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### TextBlock
|
|
102
|
+
|
|
103
|
+
Atomic text unit.
|
|
104
|
+
|
|
105
|
+
```typescript
|
|
106
|
+
import type { TextBlock } from '@heripo/model';
|
|
107
|
+
|
|
108
|
+
interface TextBlock {
|
|
109
|
+
text: string; // Text content
|
|
110
|
+
pageNo?: number; // Page number
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### ProcessedImage
|
|
115
|
+
|
|
116
|
+
Image metadata and reference information.
|
|
117
|
+
|
|
118
|
+
```typescript
|
|
119
|
+
import type { ProcessedImage } from '@heripo/model';
|
|
120
|
+
|
|
121
|
+
interface ProcessedImage {
|
|
122
|
+
id: string; // Image ID
|
|
123
|
+
caption?: Caption; // Caption (optional)
|
|
124
|
+
pdfPageNo?: number; // PDF page number
|
|
125
|
+
filePath: string; // Image file path
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### ProcessedTable
|
|
130
|
+
|
|
131
|
+
Table structure and data.
|
|
132
|
+
|
|
133
|
+
```typescript
|
|
134
|
+
import type { ProcessedTable } from '@heripo/model';
|
|
135
|
+
|
|
136
|
+
interface ProcessedTable {
|
|
137
|
+
id: string; // Table ID
|
|
138
|
+
caption?: Caption; // Caption (optional)
|
|
139
|
+
pdfPageNo?: number; // PDF page number
|
|
140
|
+
data: ProcessedTableCell[][]; // 2D grid data
|
|
141
|
+
numRows: number; // Row count
|
|
142
|
+
numCols: number; // Column count
|
|
143
|
+
}
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### ProcessedTableCell
|
|
147
|
+
|
|
148
|
+
Table cell metadata.
|
|
149
|
+
|
|
150
|
+
```typescript
|
|
151
|
+
import type { ProcessedTableCell } from '@heripo/model';
|
|
152
|
+
|
|
153
|
+
interface ProcessedTableCell {
|
|
154
|
+
text: string; // Cell text
|
|
155
|
+
rowspan: number; // Row span
|
|
156
|
+
colspan: number; // Column span
|
|
157
|
+
isHeader: boolean; // Is header cell
|
|
158
|
+
}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Caption
|
|
162
|
+
|
|
163
|
+
Image and table captions.
|
|
164
|
+
|
|
165
|
+
```typescript
|
|
166
|
+
import type { Caption } from '@heripo/model';
|
|
167
|
+
|
|
168
|
+
interface Caption {
|
|
169
|
+
num?: number; // Caption number (e.g., 1 in "Figure 1")
|
|
170
|
+
fullText: string; // Full caption text
|
|
171
|
+
}
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### PageRange
|
|
175
|
+
|
|
176
|
+
PDF page to document page mapping.
|
|
177
|
+
|
|
178
|
+
```typescript
|
|
179
|
+
import type { PageRange } from '@heripo/model';
|
|
180
|
+
|
|
181
|
+
interface PageRange {
|
|
182
|
+
pdfPageNo: number; // PDF page number
|
|
183
|
+
pageNo: number; // Document logical page number
|
|
184
|
+
}
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## Usage
|
|
188
|
+
|
|
189
|
+
### Reading ProcessedDocument
|
|
190
|
+
|
|
191
|
+
```typescript
|
|
192
|
+
import type { Chapter, ProcessedDocument } from '@heripo/model';
|
|
193
|
+
|
|
194
|
+
function analyzeDocument(doc: ProcessedDocument) {
|
|
195
|
+
console.log('Report ID:', doc.reportId);
|
|
196
|
+
|
|
197
|
+
// Iterate chapters
|
|
198
|
+
doc.chapters.forEach((chapter) => {
|
|
199
|
+
console.log(`Chapter: ${chapter.title} (level ${chapter.level})`);
|
|
200
|
+
console.log(` Text blocks: ${chapter.textBlocks.length}`);
|
|
201
|
+
console.log(` Images: ${chapter.imageIds.length}`);
|
|
202
|
+
console.log(` Tables: ${chapter.tableIds.length}`);
|
|
203
|
+
console.log(` Sub-chapters: ${chapter.children.length}`);
|
|
204
|
+
});
|
|
205
|
+
|
|
206
|
+
// Check images
|
|
207
|
+
doc.images.forEach((image) => {
|
|
208
|
+
console.log(`Image ${image.id}:`);
|
|
209
|
+
if (image.caption) {
|
|
210
|
+
console.log(` Caption: ${image.caption.fullText}`);
|
|
211
|
+
}
|
|
212
|
+
console.log(` Path: ${image.filePath}`);
|
|
213
|
+
});
|
|
214
|
+
|
|
215
|
+
// Check tables
|
|
216
|
+
doc.tables.forEach((table) => {
|
|
217
|
+
console.log(`Table ${table.id}:`);
|
|
218
|
+
console.log(` Size: ${table.numRows} x ${table.numCols}`);
|
|
219
|
+
if (table.caption) {
|
|
220
|
+
console.log(` Caption: ${table.caption.fullText}`);
|
|
221
|
+
}
|
|
222
|
+
});
|
|
223
|
+
}
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### Recursive Chapter Traversal
|
|
227
|
+
|
|
228
|
+
```typescript
|
|
229
|
+
import type { Chapter } from '@heripo/model';
|
|
230
|
+
|
|
231
|
+
function traverseChapters(chapter: Chapter, depth: number = 0) {
|
|
232
|
+
const indent = ' '.repeat(depth);
|
|
233
|
+
console.log(`${indent}- ${chapter.title}`);
|
|
234
|
+
|
|
235
|
+
// Recursively traverse sub-chapters
|
|
236
|
+
chapter.children.forEach((child) => {
|
|
237
|
+
traverseChapters(child, depth + 1);
|
|
238
|
+
});
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// Usage
|
|
242
|
+
doc.chapters.forEach((chapter) => traverseChapters(chapter));
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Type Guards
|
|
246
|
+
|
|
247
|
+
```typescript
|
|
248
|
+
import type { ProcessedImage, ProcessedTable } from '@heripo/model';
|
|
249
|
+
|
|
250
|
+
function hasCaption(
|
|
251
|
+
resource: ProcessedImage | ProcessedTable,
|
|
252
|
+
): resource is (ProcessedImage | ProcessedTable) & { caption: NonNullable<ProcessedImage['caption']> } {
|
|
253
|
+
return resource.caption !== undefined;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
// Usage
|
|
257
|
+
const resourcesWithCaptions = [...doc.images, ...doc.tables].filter(hasCaption);
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## Related Packages
|
|
261
|
+
|
|
262
|
+
- [@heripo/pdf-parser](../pdf-parser) - PDF parsing and OCR
|
|
263
|
+
- [@heripo/document-processor](../document-processor) - Document structure analysis
|
|
264
|
+
|
|
265
|
+
## License
|
|
266
|
+
|
|
267
|
+
This package is distributed under the [Apache License 2.0](../../LICENSE).
|
|
268
|
+
|
|
269
|
+
## Contributing
|
|
270
|
+
|
|
271
|
+
Contributions are always welcome! Please see the [Contributing Guide](../../CONTRIBUTING.md).
|
|
272
|
+
|
|
273
|
+
## Project-Wide Information
|
|
274
|
+
|
|
275
|
+
For project-wide information not covered in this package, see the [root README](../../README.md):
|
|
276
|
+
|
|
277
|
+
- **Citation and Attribution**: Academic citation (BibTeX) and attribution methods
|
|
278
|
+
- **Contributing Guidelines**: Development guidelines, commit rules, PR procedures
|
|
279
|
+
- **Community**: Issue tracker, discussions, security policy
|
|
280
|
+
- **Roadmap**: Project development plans
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
**heripo lab** | [GitHub](https://github.com/heripo-lab) | [heripo engine](https://github.com/heripo-lab/heripo-engine)
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __copyProps = (to, from, except, desc) => {
|
|
7
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
8
|
+
for (let key of __getOwnPropNames(from))
|
|
9
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
10
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
11
|
+
}
|
|
12
|
+
return to;
|
|
13
|
+
};
|
|
14
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
15
|
+
|
|
16
|
+
// src/index.ts
|
|
17
|
+
var index_exports = {};
|
|
18
|
+
module.exports = __toCommonJS(index_exports);
|
|
19
|
+
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["export type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
|