hazo_files 1.4.1 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -16,7 +16,10 @@ A powerful, modular file management package for Node.js and React applications w
16
16
  - **Extraction Data Management**: Track and manage LLM-extracted metadata with merge strategies
17
17
  - **LLM Integration**: Built-in support for hazo_llm_api document/image extraction
18
18
  - **Upload + Extract Workflow**: Combined service for uploading files with automatic LLM extraction and naming
19
+ - **File Reference Tracking**: Multi-entity file references with orphan detection, soft delete, and lifecycle management
19
20
  - **File Change Detection**: xxHash-based content hashing for efficient change detection
21
+ - **Content Tagging**: Optional LLM-based content classification, run at upload time or on demand, with the result stored in the `content_tag` field
22
+ - **Schema Migrations**: Built-in V2/V3 migration utilities for adding reference tracking and content tagging to existing databases
20
23
  - **TypeScript**: Full type safety and IntelliSense support
21
24
  - **OAuth Integration**: Built-in Google Drive OAuth authentication
22
25
  - **Progress Tracking**: Upload/download progress callbacks
@@ -1106,11 +1109,17 @@ const extractionService = new LLMExtractionService((provider, options) => {
1106
1109
  return createLLM({ provider, ...options });
1107
1110
  }, 'gemini');
1108
1111
 
1109
- // Create upload + extract service
1112
+ // Create upload + extract service (with optional content tag config)
1110
1113
  const uploadExtract = new UploadExtractService(
1111
1114
  trackedFileManager,
1112
1115
  namingService,
1113
- extractionService
1116
+ extractionService,
1117
+ {
1118
+ content_tag_set_by_llm: true,
1119
+ content_tag_prompt_area: 'classification',
1120
+ content_tag_prompt_key: 'classify_document',
1121
+ content_tag_prompt_return_fieldname: 'document_type',
1122
+ }
1114
1123
  );
1115
1124
 
1116
1125
  // Upload with extraction and naming convention
@@ -1138,6 +1147,8 @@ if (result.success) {
1138
1147
  console.log('Uploaded to:', result.generatedPath);
1139
1148
  // e.g., '/documents/2024/ACME/ACME_Q4_2024-12-09_001.pdf'
1140
1149
  console.log('Extracted data:', result.extraction?.data);
1150
+ console.log('Content tag:', result.contentTag);
1151
+ // e.g., 'invoice', 'report', 'contract'
1141
1152
  }
1142
1153
 
1143
1154
  // Generate path preview without uploading
@@ -1192,6 +1203,169 @@ const autoResult = await extractionService.extract(
1192
1203
  );
1193
1204
  ```
1194
1205
 
1206
+ ## Content Tagging
1207
+
1208
+ Automatically classify uploaded files using LLM-based content analysis. The `content_tag` field stores a classification string (e.g., "invoice", "report", "contract") determined by an LLM prompt.
1209
+
1210
+ ### Configuration
1211
+
1212
+ ```typescript
1213
+ import type { ContentTagConfig } from 'hazo_files';
1214
+
1215
+ const contentTagConfig: ContentTagConfig = {
1216
+ content_tag_set_by_llm: true,
1217
+ content_tag_prompt_area: 'classification',
1218
+ content_tag_prompt_key: 'classify_document',
1219
+ content_tag_prompt_return_fieldname: 'document_type',
1220
+ content_tag_prompt_variables: { language: 'en' }, // optional
1221
+ };
1222
+ ```
1223
+
1224
+ ### Automatic Tagging at Upload
1225
+
1226
+ Pass a `ContentTagConfig` to the `UploadExtractService` constructor to set the default for all uploads, or override it for a single upload via the `contentTagConfig` option:
1227
+
1228
+ ```typescript
1229
+ // Per-upload override
1230
+ const result = await uploadExtract.uploadWithExtract(buffer, 'file.pdf', {
1231
+ basePath: '/docs',
1232
+ contentTagConfig: {
1233
+ content_tag_set_by_llm: true,
1234
+ content_tag_prompt_area: 'classification',
1235
+ content_tag_prompt_key: 'classify_document',
1236
+ content_tag_prompt_return_fieldname: 'document_type',
1237
+ },
1238
+ });
1239
+ console.log(result.contentTag); // e.g., 'invoice'
1240
+ ```
1241
+
1242
+ ### Manual Tagging
1243
+
1244
+ Tag existing files by their database record ID:
1245
+
1246
+ ```typescript
1247
+ const tagResult = await uploadExtract.tagFileContent('file-record-id');
1248
+ if (tagResult.success) {
1249
+ console.log('Tagged as:', tagResult.data);
1250
+ }
1251
+ ```
1252
+
1253
+ ### V3 Database Migration
1254
+
1255
+ If you have an existing `hazo_files` table, run the V3 migration to add the `content_tag` column:
1256
+
1257
+ ```typescript
1258
+ import { migrateToV3, HAZO_FILES_MIGRATION_V3 } from 'hazo_files';
1259
+
1260
+ // Using the migration helper
1261
+ await migrateToV3(
1262
+ { run: (sql) => db.run(sql) },
1263
+ 'sqlite'
1264
+ );
1265
+
1266
+ // Or run statements manually
1267
+ for (const stmt of HAZO_FILES_MIGRATION_V3.sqlite.alterStatements) {
1268
+ try { await db.run(stmt); } catch { /* column exists */ }
1269
+ }
1270
+ ```
1271
+
1272
+ New tables created with `HAZO_FILES_TABLE_SCHEMA` already include the `content_tag` column.
1273
+
1274
+ ## File Reference Tracking
1275
+
1276
+ Track which entities (form fields, chat messages, etc.) reference each file. Multiple entities can reference the same file, enabling shared files without duplication.
1277
+
1278
+ ### Adding and Removing References
1279
+
1280
+ ```typescript
1281
+ import { TrackedFileManager } from 'hazo_files';
1282
+
1283
+ // Upload a file with an initial reference
1284
+ const result = await trackedManager.uploadFileWithRef(buffer, '/docs/report.pdf', {
1285
+ scope_id: 'workspace-123',
1286
+ uploaded_by: 'user-456',
1287
+ ref: {
1288
+ entity_type: 'form_field',
1289
+ entity_id: 'field-789',
1290
+ created_by: 'user-456',
1291
+ },
1292
+ });
1293
+ // result.data.file_id, result.data.ref_id
1294
+
1295
+ // Add another reference to the same file
1296
+ await trackedManager.addRef(fileId, {
1297
+ entity_type: 'chat_message',
1298
+ entity_id: 'msg-abc',
1299
+ });
1300
+
1301
+ // Remove a specific reference
1302
+ const { remaining_refs } = await trackedManager.removeRef(fileId, refId);
1303
+
1304
+ // Get file with status info
1305
+ const fileStatus = await trackedManager.getFileById(fileId);
1306
+ // { record, refs: FileRef[], is_orphaned: boolean }
1307
+ ```
1308
+
1309
+ ### Orphan Detection and Cleanup
1310
+
1311
+ ```typescript
1312
+ // Find files with zero references
1313
+ const orphans = await trackedManager.findOrphanedFiles({
1314
+ olderThanMs: 7 * 24 * 60 * 60 * 1000, // 7 days old
1315
+ scope_id: 'workspace-123',
1316
+ });
1317
+
1318
+ // Clean up orphaned files (delete physical files + DB records)
1319
+ const { cleaned, errors } = await trackedManager.cleanupOrphanedFiles({
1320
+ olderThanMs: 30 * 24 * 60 * 60 * 1000,
1321
+ softDeleteOnly: false, // true to only mark as soft_deleted
1322
+ });
1323
+
1324
+ // Soft-delete a specific file
1325
+ await trackedManager.softDeleteFile(fileId);
1326
+
1327
+ // Verify physical file existence
1328
+ const exists = await trackedManager.verifyFileExistence(fileId);
1329
+ ```
1330
+
1331
+ ### Database Migration (Existing Databases)
1332
+
1333
+ If you have an existing `hazo_files` table, run the V2 migration to add reference tracking columns:
1334
+
1335
+ ```typescript
1336
+ import { migrateToV2, backfillV2Defaults, HAZO_FILES_MIGRATION_V2 } from 'hazo_files';
1337
+
1338
+ // Using the migration helper
1339
+ await migrateToV2(
1340
+ { run: (sql) => db.exec(sql) }, // SQLite
1341
+ 'sqlite'
1342
+ );
1343
+ await backfillV2Defaults({ run: (sql) => db.exec(sql) }, 'sqlite');
1344
+
1345
+ // Or run statements manually
1346
+ for (const stmt of HAZO_FILES_MIGRATION_V2.sqlite.alterStatements) {
1347
+ try { await db.run(stmt); } catch { /* column exists */ }
1348
+ }
1349
+ for (const idx of HAZO_FILES_MIGRATION_V2.sqlite.indexes) {
1350
+ await db.run(idx);
1351
+ }
1352
+ ```
1353
+
1354
+ New tables created with `HAZO_FILES_TABLE_SCHEMA` already include V2 columns. For the V3 content tagging migration, see [Content Tagging](#content-tagging) above.
1355
+
1356
+ ### Reference Tracking Types
1357
+
1358
+ ```typescript
1359
+ import type {
1360
+ FileRef, // Individual reference from entity to file
1361
+ FileMetadataRecordV2, // Extended record with refs, status, scope
1362
+ FileWithStatus, // Rich view: record + parsed refs + is_orphaned
1363
+ FileStatus, // 'active' | 'orphaned' | 'soft_deleted' | 'missing'
1364
+ AddRefOptions, // Options for adding a reference
1365
+ RemoveRefsCriteria, // Criteria for bulk ref removal
1366
+ } from 'hazo_files';
1367
+ ```
1368
+
1195
1369
  ## File Change Detection
1196
1370
 
1197
1371
  Detect file content changes using fast xxHash hashing.
@@ -1236,6 +1410,13 @@ const hazoFiles = await createHazoFilesServer({
1236
1410
  },
1237
1411
  enableTracking: true,
1238
1412
  llmFactory: (provider) => createLLM({ provider }),
1413
+ // Optional: enable automatic content tagging for all uploads
1414
+ defaultContentTagConfig: {
1415
+ content_tag_set_by_llm: true,
1416
+ content_tag_prompt_area: 'classification',
1417
+ content_tag_prompt_key: 'classify_document',
1418
+ content_tag_prompt_return_fieldname: 'document_type',
1419
+ },
1239
1420
  });
1240
1421
 
1241
1422
  // Access all services