@heripo/model 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,284 @@
1
+ # @heripo/model
2
+
3
+ > Document models and type definitions
4
+
5
+ [![npm version](https://img.shields.io/npm/v/@heripo/model.svg)](https://www.npmjs.com/package/@heripo/model)
6
+ [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D22-339933?logo=node.js&logoColor=white)](https://nodejs.org/)
7
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](../../LICENSE)
8
+
9
+ **English** | [한국어](./README.ko.md)
10
+
11
+ > **Note**: Please check the [root README](../../README.md) first for the project overview, installation instructions, and roadmap.
12
+
13
+ `@heripo/model` provides data models and TypeScript type definitions used in heripo engine.
14
+
15
+ ## Table of Contents
16
+
17
+ - [Overview](#overview)
18
+ - [Installation](#installation)
19
+ - [Data Models](#data-models)
20
+ - [Usage](#usage)
21
+ - [License](#license)
22
+
23
+ ## Overview
24
+
25
+ heripo engine's data processing pipeline:
26
+
27
+ ```
28
+ DoclingDocument (Docling SDK raw output)
29
+ ↓
30
+ ProcessedDocument (LLM-optimized intermediate model)
31
+ ↓
32
+ (Various models to be added per roadmap)
33
+ ```
34
+
35
+ `@heripo/model` defines data models currently used in the PDF parsing and document structure extraction stages. Various domain-specific models for archaeological data analysis, standardization, semantic modeling, etc. will be added in the future.
36
+
37
+ ## Installation
38
+
39
+ ```bash
40
+ # Install with npm
41
+ npm install @heripo/model
42
+
43
+ # Install with pnpm
44
+ pnpm add @heripo/model
45
+
46
+ # Install with yarn
47
+ yarn add @heripo/model
48
+ ```
49
+
50
+ ## Data Models
51
+
52
+ ### DoclingDocument
53
+
54
+ Raw output format from Docling SDK.
55
+
56
+ ```typescript
57
+ import type { DoclingDocument } from '@heripo/model';
58
+ ```
59
+
60
+ **Key Fields:**
61
+
62
+ - `type`: Document type (e.g., "pdf")
63
+ - `item_index`: Item index
64
+ - `json_content`: Document content (JSON object)
65
+
66
+ ### ProcessedDocument
67
+
68
+ Intermediate data model optimized for LLM analysis.
69
+
70
+ ```typescript
71
+ import type { ProcessedDocument } from '@heripo/model';
72
+
73
+ interface ProcessedDocument {
74
+ reportId: string; // Report ID
75
+ pageRangeMap: PageRange[]; // PDF page → document page mapping
76
+ chapters: Chapter[]; // Hierarchical chapter structure
77
+ images: ProcessedImage[]; // Extracted image metadata
78
+ tables: ProcessedTable[]; // Extracted table data
79
+ }
80
+ ```
81
+
82
+ ### Chapter
83
+
84
+ Hierarchical section structure of the document.
85
+
86
+ ```typescript
87
+ import type { Chapter } from '@heripo/model';
88
+
89
+ interface Chapter {
90
+ id: string; // Chapter ID
91
+ title: string; // Chapter title
92
+ level: number; // Hierarchy level (1, 2, 3, ...)
93
+ pageNo?: number; // Start page number
94
+ textBlocks: TextBlock[]; // Text blocks
95
+ imageIds: string[]; // Image ID references
96
+ tableIds: string[]; // Table ID references
97
+ children: Chapter[]; // Sub-chapters
98
+ }
99
+ ```
100
+
101
+ ### TextBlock
102
+
103
+ Atomic text unit.
104
+
105
+ ```typescript
106
+ import type { TextBlock } from '@heripo/model';
107
+
108
+ interface TextBlock {
109
+ text: string; // Text content
110
+ pageNo?: number; // Page number
111
+ }
112
+ ```
113
+
114
+ ### ProcessedImage
115
+
116
+ Image metadata and reference information.
117
+
118
+ ```typescript
119
+ import type { ProcessedImage } from '@heripo/model';
120
+
121
+ interface ProcessedImage {
122
+ id: string; // Image ID
123
+ caption?: Caption; // Caption (optional)
124
+ pdfPageNo?: number; // PDF page number
125
+ filePath: string; // Image file path
126
+ }
127
+ ```
128
+
129
+ ### ProcessedTable
130
+
131
+ Table structure and data.
132
+
133
+ ```typescript
134
+ import type { ProcessedTable } from '@heripo/model';
135
+
136
+ interface ProcessedTable {
137
+ id: string; // Table ID
138
+ caption?: Caption; // Caption (optional)
139
+ pdfPageNo?: number; // PDF page number
140
+ data: ProcessedTableCell[][]; // 2D grid data
141
+ numRows: number; // Row count
142
+ numCols: number; // Column count
143
+ }
144
+ ```
145
+
146
+ ### ProcessedTableCell
147
+
148
+ Table cell metadata.
149
+
150
+ ```typescript
151
+ import type { ProcessedTableCell } from '@heripo/model';
152
+
153
+ interface ProcessedTableCell {
154
+ text: string; // Cell text
155
+ rowspan: number; // Row span
156
+ colspan: number; // Column span
157
+ isHeader: boolean; // Is header cell
158
+ }
159
+ ```
160
+
161
+ ### Caption
162
+
163
+ Image and table captions.
164
+
165
+ ```typescript
166
+ import type { Caption } from '@heripo/model';
167
+
168
+ interface Caption {
169
+ num?: number; // Caption number (e.g., 1 in "Figure 1")
170
+ fullText: string; // Full caption text
171
+ }
172
+ ```
173
+
174
+ ### PageRange
175
+
176
+ PDF page to document page mapping.
177
+
178
+ ```typescript
179
+ import type { PageRange } from '@heripo/model';
180
+
181
+ interface PageRange {
182
+ pdfPageNo: number; // PDF page number
183
+ pageNo: number; // Document logical page number
184
+ }
185
+ ```
186
+
187
+ ## Usage
188
+
189
+ ### Reading ProcessedDocument
190
+
191
+ ```typescript
192
+ import type { ProcessedDocument } from '@heripo/model';
193
+
194
+ function analyzeDocument(doc: ProcessedDocument) {
195
+ console.log('Report ID:', doc.reportId);
196
+
197
+ // Iterate chapters
198
+ doc.chapters.forEach((chapter) => {
199
+ console.log(`Chapter: ${chapter.title} (level ${chapter.level})`);
200
+ console.log(` Text blocks: ${chapter.textBlocks.length}`);
201
+ console.log(` Images: ${chapter.imageIds.length}`);
202
+ console.log(` Tables: ${chapter.tableIds.length}`);
203
+ console.log(` Sub-chapters: ${chapter.children.length}`);
204
+ });
205
+
206
+ // Check images
207
+ doc.images.forEach((image) => {
208
+ console.log(`Image ${image.id}:`);
209
+ if (image.caption) {
210
+ console.log(` Caption: ${image.caption.fullText}`);
211
+ }
212
+ console.log(` Path: ${image.filePath}`);
213
+ });
214
+
215
+ // Check tables
216
+ doc.tables.forEach((table) => {
217
+ console.log(`Table ${table.id}:`);
218
+ console.log(` Size: ${table.numRows} x ${table.numCols}`);
219
+ if (table.caption) {
220
+ console.log(` Caption: ${table.caption.fullText}`);
221
+ }
222
+ });
223
+ }
224
+ ```
225
+
226
+ ### Recursive Chapter Traversal
227
+
228
+ ```typescript
229
+ import type { Chapter } from '@heripo/model';
230
+
231
+ function traverseChapters(chapter: Chapter, depth: number = 0) {
232
+ const indent = ' '.repeat(depth);
233
+ console.log(`${indent}- ${chapter.title}`);
234
+
235
+ // Recursively traverse sub-chapters
236
+ chapter.children.forEach((child) => {
237
+ traverseChapters(child, depth + 1);
238
+ });
239
+ }
240
+
241
+ // Usage (assumes `doc` is a ProcessedDocument already in scope)
242
+ doc.chapters.forEach((chapter) => traverseChapters(chapter));
243
+ ```
244
+
245
+ ### Type Guards
246
+
247
+ ```typescript
248
+ import type { Caption, ProcessedImage, ProcessedTable } from '@heripo/model';
249
+
250
+ function hasCaption(
251
+ resource: ProcessedImage | ProcessedTable,
252
+ ): resource is (ProcessedImage | ProcessedTable) & { caption: Caption } {
253
+ return resource.caption !== undefined;
254
+ }
255
+
256
+ // Usage (assumes `doc` is a ProcessedDocument already in scope)
257
+ const resourcesWithCaptions = [...doc.images, ...doc.tables].filter(hasCaption);
258
+ ```
259
+
260
+ ## Related Packages
261
+
262
+ - [@heripo/pdf-parser](../pdf-parser) - PDF parsing and OCR
263
+ - [@heripo/document-processor](../document-processor) - Document structure analysis
264
+
265
+ ## License
266
+
267
+ This package is distributed under the [Apache License 2.0](../../LICENSE).
268
+
269
+ ## Contributing
270
+
271
+ Contributions are always welcome! Please see the [Contributing Guide](../../CONTRIBUTING.md).
272
+
273
+ ## Project-Wide Information
274
+
275
+ For project-wide information not covered in this package, see the [root README](../../README.md):
276
+
277
+ - **Citation and Attribution**: Academic citation (BibTeX) and attribution methods
278
+ - **Contributing Guidelines**: Development guidelines, commit rules, PR procedures
279
+ - **Community**: Issue tracker, discussions, security policy
280
+ - **Roadmap**: Project development plans
281
+
282
+ ---
283
+
284
+ **heripo lab** | [GitHub](https://github.com/heripo-lab) | [heripo engine](https://github.com/heripo-lab/heripo-engine)
package/dist/index.cjs ADDED
@@ -0,0 +1,19 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __copyProps = (to, from, except, desc) => {
7
+ if (from && typeof from === "object" || typeof from === "function") {
8
+ for (let key of __getOwnPropNames(from))
9
+ if (!__hasOwnProp.call(to, key) && key !== except)
10
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
11
+ }
12
+ return to;
13
+ };
14
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
15
+
16
+ // src/index.ts
17
+ var index_exports = {};
18
+ module.exports = __toCommonJS(index_exports);
19
+ //# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}