hazo_files 1.4.1 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +183 -2
- package/dist/index.d.mts +507 -3
- package/dist/index.d.ts +507 -3
- package/dist/index.js +750 -13
- package/dist/index.mjs +735 -13
- package/dist/server/index.d.mts +512 -3
- package/dist/server/index.d.ts +512 -3
- package/dist/server/index.js +754 -15
- package/dist/server/index.mjs +739 -15
- package/package.json +2 -6
package/README.md
CHANGED
|
@@ -16,7 +16,10 @@ A powerful, modular file management package for Node.js and React applications w
|
|
|
16
16
|
- **Extraction Data Management**: Track and manage LLM-extracted metadata with merge strategies
|
|
17
17
|
- **LLM Integration**: Built-in support for hazo_llm_api document/image extraction
|
|
18
18
|
- **Upload + Extract Workflow**: Combined service for uploading files with automatic LLM extraction and naming
|
|
19
|
+
- **File Reference Tracking**: Multi-entity file references with orphan detection, soft delete, and lifecycle management
|
|
19
20
|
- **File Change Detection**: xxHash-based content hashing for efficient change detection
|
|
21
|
+
- **Content Tagging**: Optional LLM-based content classification, run at upload time or on demand, with the result stored in the `content_tag` field
|
|
22
|
+
- **Schema Migrations**: Built-in V2/V3 migration utilities for adding reference tracking and content tagging to existing databases
|
|
20
23
|
- **TypeScript**: Full type safety and IntelliSense support
|
|
21
24
|
- **OAuth Integration**: Built-in Google Drive OAuth authentication
|
|
22
25
|
- **Progress Tracking**: Upload/download progress callbacks
|
|
@@ -1106,11 +1109,17 @@ const extractionService = new LLMExtractionService((provider, options) => {
|
|
|
1106
1109
|
return createLLM({ provider, ...options });
|
|
1107
1110
|
}, 'gemini');
|
|
1108
1111
|
|
|
1109
|
-
// Create upload + extract service
|
|
1112
|
+
// Create upload + extract service (with optional content tag config)
|
|
1110
1113
|
const uploadExtract = new UploadExtractService(
|
|
1111
1114
|
trackedFileManager,
|
|
1112
1115
|
namingService,
|
|
1113
|
-
extractionService
|
|
1116
|
+
extractionService,
|
|
1117
|
+
{
|
|
1118
|
+
content_tag_set_by_llm: true,
|
|
1119
|
+
content_tag_prompt_area: 'classification',
|
|
1120
|
+
content_tag_prompt_key: 'classify_document',
|
|
1121
|
+
content_tag_prompt_return_fieldname: 'document_type',
|
|
1122
|
+
}
|
|
1114
1123
|
);
|
|
1115
1124
|
|
|
1116
1125
|
// Upload with extraction and naming convention
|
|
@@ -1138,6 +1147,8 @@ if (result.success) {
|
|
|
1138
1147
|
console.log('Uploaded to:', result.generatedPath);
|
|
1139
1148
|
// e.g., '/documents/2024/ACME/ACME_Q4_2024-12-09_001.pdf'
|
|
1140
1149
|
console.log('Extracted data:', result.extraction?.data);
|
|
1150
|
+
console.log('Content tag:', result.contentTag);
|
|
1151
|
+
// e.g., 'invoice', 'report', 'contract'
|
|
1141
1152
|
}
|
|
1142
1153
|
|
|
1143
1154
|
// Generate path preview without uploading
|
|
@@ -1192,6 +1203,169 @@ const autoResult = await extractionService.extract(
|
|
|
1192
1203
|
);
|
|
1193
1204
|
```
|
|
1194
1205
|
|
|
1206
|
+
## Content Tagging
|
|
1207
|
+
|
|
1208
|
+
Automatically classify uploaded files using LLM-based content analysis. The `content_tag` field stores a classification string (e.g., "invoice", "report", "contract") determined by an LLM prompt.
|
|
1209
|
+
|
|
1210
|
+
### Configuration
|
|
1211
|
+
|
|
1212
|
+
```typescript
|
|
1213
|
+
import type { ContentTagConfig } from 'hazo_files';
|
|
1214
|
+
|
|
1215
|
+
const contentTagConfig: ContentTagConfig = {
|
|
1216
|
+
content_tag_set_by_llm: true,
|
|
1217
|
+
content_tag_prompt_area: 'classification',
|
|
1218
|
+
content_tag_prompt_key: 'classify_document',
|
|
1219
|
+
content_tag_prompt_return_fieldname: 'document_type',
|
|
1220
|
+
content_tag_prompt_variables: { language: 'en' }, // optional
|
|
1221
|
+
};
|
|
1222
|
+
```
|
|
1223
|
+
|
|
1224
|
+
### Automatic Tagging at Upload
|
|
1225
|
+
|
|
1226
|
+
Pass `contentTagConfig` to the `UploadExtractService` constructor (to set the default for all uploads) or override it for a single upload via the options argument:
|
|
1227
|
+
|
|
1228
|
+
```typescript
|
|
1229
|
+
// Per-upload override
|
|
1230
|
+
const result = await uploadExtract.uploadWithExtract(buffer, 'file.pdf', {
|
|
1231
|
+
basePath: '/docs',
|
|
1232
|
+
contentTagConfig: {
|
|
1233
|
+
content_tag_set_by_llm: true,
|
|
1234
|
+
content_tag_prompt_area: 'classification',
|
|
1235
|
+
content_tag_prompt_key: 'classify_document',
|
|
1236
|
+
content_tag_prompt_return_fieldname: 'document_type',
|
|
1237
|
+
},
|
|
1238
|
+
});
|
|
1239
|
+
console.log(result.contentTag); // e.g., 'invoice'
|
|
1240
|
+
```
|
|
1241
|
+
|
|
1242
|
+
### Manual Tagging
|
|
1243
|
+
|
|
1244
|
+
Tag existing files by their database record ID:
|
|
1245
|
+
|
|
1246
|
+
```typescript
|
|
1247
|
+
const tagResult = await uploadExtract.tagFileContent('file-record-id');
|
|
1248
|
+
if (tagResult.success) {
|
|
1249
|
+
console.log('Tagged as:', tagResult.data);
|
|
1250
|
+
}
|
|
1251
|
+
```
|
|
1252
|
+
|
|
1253
|
+
### V3 Database Migration
|
|
1254
|
+
|
|
1255
|
+
If you have an existing `hazo_files` table, run the V3 migration to add the `content_tag` column:
|
|
1256
|
+
|
|
1257
|
+
```typescript
|
|
1258
|
+
import { migrateToV3, HAZO_FILES_MIGRATION_V3 } from 'hazo_files';
|
|
1259
|
+
|
|
1260
|
+
// Using the migration helper
|
|
1261
|
+
await migrateToV3(
|
|
1262
|
+
{ run: (sql) => db.run(sql) },
|
|
1263
|
+
'sqlite'
|
|
1264
|
+
);
|
|
1265
|
+
|
|
1266
|
+
// Or run statements manually
|
|
1267
|
+
for (const stmt of HAZO_FILES_MIGRATION_V3.sqlite.alterStatements) {
|
|
1268
|
+
try { await db.run(stmt); } catch { /* column exists */ }
|
|
1269
|
+
}
|
|
1270
|
+
```
|
|
1271
|
+
|
|
1272
|
+
New tables created with `HAZO_FILES_TABLE_SCHEMA` already include the `content_tag` column.
|
|
1273
|
+
|
|
1274
|
+
## File Reference Tracking
|
|
1275
|
+
|
|
1276
|
+
Track which entities (form fields, chat messages, etc.) reference each file. Multiple entities can reference the same file, enabling shared files without duplication.
|
|
1277
|
+
|
|
1278
|
+
### Adding and Removing References
|
|
1279
|
+
|
|
1280
|
+
```typescript
|
|
1281
|
+
import { TrackedFileManager } from 'hazo_files';
|
|
1282
|
+
|
|
1283
|
+
// Upload a file with an initial reference
|
|
1284
|
+
const result = await trackedManager.uploadFileWithRef(buffer, '/docs/report.pdf', {
|
|
1285
|
+
scope_id: 'workspace-123',
|
|
1286
|
+
uploaded_by: 'user-456',
|
|
1287
|
+
ref: {
|
|
1288
|
+
entity_type: 'form_field',
|
|
1289
|
+
entity_id: 'field-789',
|
|
1290
|
+
created_by: 'user-456',
|
|
1291
|
+
},
|
|
1292
|
+
});
|
|
1293
|
+
// result.data.file_id, result.data.ref_id
|
|
1294
|
+
|
|
1295
|
+
// Add another reference to the same file
|
|
1296
|
+
await trackedManager.addRef(fileId, {
|
|
1297
|
+
entity_type: 'chat_message',
|
|
1298
|
+
entity_id: 'msg-abc',
|
|
1299
|
+
});
|
|
1300
|
+
|
|
1301
|
+
// Remove a specific reference
|
|
1302
|
+
const { remaining_refs } = await trackedManager.removeRef(fileId, refId);
|
|
1303
|
+
|
|
1304
|
+
// Get file with status info
|
|
1305
|
+
const fileStatus = await trackedManager.getFileById(fileId);
|
|
1306
|
+
// { record, refs: FileRef[], is_orphaned: boolean }
|
|
1307
|
+
```
|
|
1308
|
+
|
|
1309
|
+
### Orphan Detection and Cleanup
|
|
1310
|
+
|
|
1311
|
+
```typescript
|
|
1312
|
+
// Find files with zero references
|
|
1313
|
+
const orphans = await trackedManager.findOrphanedFiles({
|
|
1314
|
+
olderThanMs: 7 * 24 * 60 * 60 * 1000, // 7 days old
|
|
1315
|
+
scope_id: 'workspace-123',
|
|
1316
|
+
});
|
|
1317
|
+
|
|
1318
|
+
// Clean up orphaned files (delete physical files + DB records)
|
|
1319
|
+
const { cleaned, errors } = await trackedManager.cleanupOrphanedFiles({
|
|
1320
|
+
olderThanMs: 30 * 24 * 60 * 60 * 1000,
|
|
1321
|
+
softDeleteOnly: false, // true to only mark as soft_deleted
|
|
1322
|
+
});
|
|
1323
|
+
|
|
1324
|
+
// Soft-delete a specific file
|
|
1325
|
+
await trackedManager.softDeleteFile(fileId);
|
|
1326
|
+
|
|
1327
|
+
// Verify physical file existence
|
|
1328
|
+
const exists = await trackedManager.verifyFileExistence(fileId);
|
|
1329
|
+
```
|
|
1330
|
+
|
|
1331
|
+
### Database Migration (Existing Databases)
|
|
1332
|
+
|
|
1333
|
+
If you have an existing `hazo_files` table, run the V2 migration to add reference tracking columns:
|
|
1334
|
+
|
|
1335
|
+
```typescript
|
|
1336
|
+
import { migrateToV2, backfillV2Defaults, HAZO_FILES_MIGRATION_V2 } from 'hazo_files';
|
|
1337
|
+
|
|
1338
|
+
// Using the migration helper
|
|
1339
|
+
await migrateToV2(
|
|
1340
|
+
{ run: (sql) => db.exec(sql) }, // SQLite
|
|
1341
|
+
'sqlite'
|
|
1342
|
+
);
|
|
1343
|
+
await backfillV2Defaults({ run: (sql) => db.exec(sql) }, 'sqlite');
|
|
1344
|
+
|
|
1345
|
+
// Or run statements manually
|
|
1346
|
+
for (const stmt of HAZO_FILES_MIGRATION_V2.sqlite.alterStatements) {
|
|
1347
|
+
try { await db.run(stmt); } catch { /* column exists */ }
|
|
1348
|
+
}
|
|
1349
|
+
for (const idx of HAZO_FILES_MIGRATION_V2.sqlite.indexes) {
|
|
1350
|
+
await db.run(idx);
|
|
1351
|
+
}
|
|
1352
|
+
```
|
|
1353
|
+
|
|
1354
|
+
New tables created with `HAZO_FILES_TABLE_SCHEMA` already include the V2 columns. For the V3 content-tagging migration, see [Content Tagging](#content-tagging) above.
|
|
1355
|
+
|
|
1356
|
+
### Reference Tracking Types
|
|
1357
|
+
|
|
1358
|
+
```typescript
|
|
1359
|
+
import type {
|
|
1360
|
+
FileRef, // Individual reference from entity to file
|
|
1361
|
+
FileMetadataRecordV2, // Extended record with refs, status, scope
|
|
1362
|
+
FileWithStatus, // Rich view: record + parsed refs + is_orphaned
|
|
1363
|
+
FileStatus, // 'active' | 'orphaned' | 'soft_deleted' | 'missing'
|
|
1364
|
+
AddRefOptions, // Options for adding a reference
|
|
1365
|
+
RemoveRefsCriteria, // Criteria for bulk ref removal
|
|
1366
|
+
} from 'hazo_files';
|
|
1367
|
+
```
|
|
1368
|
+
|
|
1195
1369
|
## File Change Detection
|
|
1196
1370
|
|
|
1197
1371
|
Detect file content changes using fast xxHash hashing.
|
|
@@ -1236,6 +1410,13 @@ const hazoFiles = await createHazoFilesServer({
|
|
|
1236
1410
|
},
|
|
1237
1411
|
enableTracking: true,
|
|
1238
1412
|
llmFactory: (provider) => createLLM({ provider }),
|
|
1413
|
+
// Optional: enable automatic content tagging for all uploads
|
|
1414
|
+
defaultContentTagConfig: {
|
|
1415
|
+
content_tag_set_by_llm: true,
|
|
1416
|
+
content_tag_prompt_area: 'classification',
|
|
1417
|
+
content_tag_prompt_key: 'classify_document',
|
|
1418
|
+
content_tag_prompt_return_fieldname: 'document_type',
|
|
1419
|
+
},
|
|
1239
1420
|
});
|
|
1240
1421
|
|
|
1241
1422
|
// Access all services
|