@magda/semantic-indexer-framework 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/dist/MinioClient.d.ts +7 -0
  2. package/dist/MinioClient.js +37 -0
  3. package/dist/MinioClient.js.map +1 -0
  4. package/dist/SkipError.d.ts +1 -0
  5. package/dist/SkipError.js +1 -0
  6. package/dist/SkipError.js.map +1 -0
  7. package/dist/chunker.d.ts +32 -0
  8. package/dist/chunker.js +84 -0
  9. package/dist/chunker.js.map +1 -0
  10. package/dist/commonYargs.d.ts +43 -0
  11. package/dist/commonYargs.js +61 -0
  12. package/dist/commonYargs.js.map +1 -0
  13. package/dist/createEmbeddingText.d.ts +18 -0
  14. package/dist/createEmbeddingText.js +1 -0
  15. package/dist/createEmbeddingText.js.map +1 -0
  16. package/dist/helpers.d.ts +1 -0
  17. package/dist/helpers.js +10 -0
  18. package/dist/helpers.js.map +1 -0
  19. package/dist/index.d.ts +8 -0
  20. package/dist/index.js +4 -0
  21. package/dist/index.js.map +1 -0
  22. package/dist/indexEmbeddingText.d.ts +22 -0
  23. package/dist/indexEmbeddingText.js +216 -0
  24. package/dist/indexEmbeddingText.js.map +1 -0
  25. package/dist/indexSchema.d.ts +105 -0
  26. package/dist/indexSchema.js +98 -0
  27. package/dist/indexSchema.js.map +1 -0
  28. package/dist/onRecordFoundRegistryRecord.d.ts +7 -0
  29. package/dist/onRecordFoundRegistryRecord.js +47 -0
  30. package/dist/onRecordFoundRegistryRecord.js.map +1 -0
  31. package/dist/onRecordFoundStorageObject.d.ts +9 -0
  32. package/dist/onRecordFoundStorageObject.js +145 -0
  33. package/dist/onRecordFoundStorageObject.js.map +1 -0
  34. package/dist/semanticIndexer.d.ts +2 -0
  35. package/dist/semanticIndexer.js +86 -0
  36. package/dist/semanticIndexer.js.map +1 -0
  37. package/dist/semanticIndexerOptions.d.ts +19 -0
  38. package/dist/semanticIndexerOptions.js +26 -0
  39. package/dist/semanticIndexerOptions.js.map +1 -0
  40. package/dist/test/BaseSemanticIndexerTest.d.ts +41 -0
  41. package/dist/test/BaseSemanticIndexerTest.js +167 -0
  42. package/dist/test/BaseSemanticIndexerTest.js.map +1 -0
  43. package/dist/test/chunker.spec.d.ts +1 -0
  44. package/dist/test/chunker.spec.js +154 -0
  45. package/dist/test/chunker.spec.js.map +1 -0
  46. package/dist/test/embeddingApiClient.spec.d.ts +1 -0
  47. package/dist/test/embeddingApiClient.spec.js +43 -0
  48. package/dist/test/embeddingApiClient.spec.js.map +1 -0
  49. package/dist/test/helpers.d.ts +4 -0
  50. package/dist/test/helpers.js +34 -0
  51. package/dist/test/helpers.js.map +1 -0
  52. package/dist/test/indexEmbeddingText.spec.d.ts +1 -0
  53. package/dist/test/indexEmbeddingText.spec.js +238 -0
  54. package/dist/test/indexEmbeddingText.spec.js.map +1 -0
  55. package/dist/test/mockEmbeddingApi.d.ts +1 -0
  56. package/dist/test/mockEmbeddingApi.js +25 -0
  57. package/dist/test/mockEmbeddingApi.js.map +1 -0
  58. package/dist/test/onRecordFoundRegistryRecord.spec.d.ts +1 -0
  59. package/dist/test/onRecordFoundRegistryRecord.spec.js +155 -0
  60. package/dist/test/onRecordFoundRegistryRecord.spec.js.map +1 -0
  61. package/dist/test/onRecordFoundStorageObject.spec.d.ts +1 -0
  62. package/dist/test/onRecordFoundStorageObject.spec.js +490 -0
  63. package/dist/test/onRecordFoundStorageObject.spec.js.map +1 -0
  64. package/package.json +78 -0
@@ -0,0 +1,105 @@
1
+ import { SemanticIndexerOptions } from "./index.js";
2
/**
 * Builds the index definition (index name, settings and field mappings)
 * for the semantic index described by `config.argv.semanticIndexerConfig`.
 *
 * `embedding.method.parameters.encoder` and `embedding.compression_level`
 * are optional: each one only appears when the matching option is
 * configured.
 */
export declare function createSemanticIndexerMapping(config: SemanticIndexerOptions): {
    indexName: string;
    settings: {
        index: { number_of_shards: number; number_of_replicas: number; };
        "index.knn": boolean;
    };
    mappings: {
        properties: {
            // Identity / provenance fields.
            itemType: { type: string; };
            recordId: { type: string; };
            aspectId: { type: string; };
            parentRecordId: { type: string; };
            fileFormat: { type: string; };
            subObjectId: { type: string; };
            subObjectType: { type: string; };
            // Indexed text chunk and its vector.
            index_text_chunk: { type: string; };
            embedding: {
                method: {
                    name: string;
                    engine: string;
                    parameters: {
                        encoder?: {
                            name: string;
                            parameters: { type: string; clip: boolean; };
                        };
                        m: number;
                        ef_construction: number;
                        ef_search: number;
                    };
                };
                compression_level?: string;
                type: string;
                dimension: number;
                space_type: string;
                mode: string;
            };
            // Chunk bookkeeping.
            only_one_index_text_chunk: { type: string; };
            index_text_chunk_length: { type: string; };
            index_text_chunk_position: { type: string; };
            index_text_chunk_overlap: { type: string; };
            // Indexer id and timestamps.
            indexerId: { type: string; };
            createTime: { type: string; format: string; };
            updateTime: { type: string; format: string; };
        };
    };
};
86
/** The two kinds of item the indexer can process. */
export type ItemType = "registryRecord" | "storageObject";

/**
 * A single document written to the semantic index.
 * Optional members are omitted from the built document when not supplied.
 */
export interface SemanticIndexDocument {
    // Identity / provenance.
    itemType: ItemType;
    recordId: string;
    aspectId?: string;
    parentRecordId?: string;
    fileFormat?: string;
    subObjectId?: string;
    subObjectType?: string;

    // Indexed text and its embedding vector.
    index_text_chunk: string;
    embedding: number[];

    // Chunk bookkeeping.
    only_one_index_text_chunk: boolean;
    index_text_chunk_length: number;
    index_text_chunk_position: number;
    index_text_chunk_overlap: number;

    // Indexer id and timestamps (the index mapping declares the timestamps
    // as `strict_date_time` dates).
    indexerId: string;
    createTime: string;
    updateTime: string;
}

/**
 * Returns a copy of `params` containing only the known document fields,
 * leaving out optional fields that are not set.
 */
export declare function buildSemanticIndexDocument(
    params: SemanticIndexDocument
): SemanticIndexDocument;
@@ -0,0 +1,98 @@
1
+ const SEMANTIC_INDEX_VERSION = 1;
2
+ export function createSemanticIndexerMapping(config) {
3
+ const indexConfig = config.argv.semanticIndexerConfig;
4
+ const knnVectorFieldConfig = indexConfig.knnVectorFieldConfig;
5
+ if (indexConfig.indexVersion !== SEMANTIC_INDEX_VERSION) {
6
+ throw new Error(`Index version mismatch. Expected ${SEMANTIC_INDEX_VERSION}, got ${indexConfig.indexVersion}`);
7
+ }
8
+ if (indexConfig.knnVectorFieldConfig.compressionLevel &&
9
+ indexConfig.knnVectorFieldConfig.encoder) {
10
+ throw new Error("compressionLevel and encoder cannot be used together");
11
+ }
12
+ return {
13
+ indexName: indexConfig.fullIndexName,
14
+ settings: {
15
+ index: {
16
+ number_of_shards: indexConfig.numberOfShards,
17
+ number_of_replicas: indexConfig.numberOfReplicas
18
+ },
19
+ "index.knn": true
20
+ },
21
+ mappings: {
22
+ properties: {
23
+ itemType: { type: "keyword" },
24
+ recordId: { type: "keyword" },
25
+ aspectId: { type: "keyword" },
26
+ parentRecordId: { type: "keyword" },
27
+ fileFormat: { type: "keyword" },
28
+ subObjectId: { type: "keyword" },
29
+ subObjectType: { type: "keyword" },
30
+ index_text_chunk: { type: "keyword" },
31
+ embedding: {
32
+ type: "knn_vector",
33
+ dimension: knnVectorFieldConfig.dimension,
34
+ space_type: knnVectorFieldConfig.spaceType,
35
+ mode: knnVectorFieldConfig.mode,
36
+ ...(knnVectorFieldConfig.compressionLevel ?
37
+ {
38
+ compression_level: knnVectorFieldConfig.compressionLevel
39
+ } :
40
+ {}),
41
+ method: {
42
+ name: "hnsw",
43
+ engine: "faiss",
44
+ parameters: {
45
+ m: knnVectorFieldConfig.m,
46
+ ef_construction: knnVectorFieldConfig.efConstruction,
47
+ ef_search: knnVectorFieldConfig.efSearch,
48
+ ...(knnVectorFieldConfig.encoder ?
49
+ {
50
+ encoder: {
51
+ name: knnVectorFieldConfig.encoder.name,
52
+ parameters: {
53
+ type: knnVectorFieldConfig.encoder.
54
+ type,
55
+ clip: knnVectorFieldConfig.encoder.
56
+ clip
57
+ }
58
+ }
59
+ } :
60
+ {})
61
+ }
62
+ }
63
+ },
64
+ only_one_index_text_chunk: { type: "boolean" },
65
+ index_text_chunk_length: { type: "integer" },
66
+ index_text_chunk_position: { type: "integer" },
67
+ index_text_chunk_overlap: { type: "integer" },
68
+ indexerId: { type: "keyword" },
69
+ createTime: { type: "date", format: "strict_date_time" },
70
+ updateTime: { type: "date", format: "strict_date_time" }
71
+ }
72
+ }
73
+ };
74
+ }
75
+ export function buildSemanticIndexDocument(params) {
76
+ return {
77
+ itemType: params.itemType,
78
+ recordId: params.recordId,
79
+ ...(params.aspectId ? { aspectId: params.aspectId } : {}),
80
+ ...(params.parentRecordId ?
81
+ { parentRecordId: params.parentRecordId } :
82
+ {}),
83
+ ...(params.fileFormat ? { fileFormat: params.fileFormat } : {}),
84
+ ...(params.subObjectId ? { subObjectId: params.subObjectId } : {}),
85
+ ...(params.subObjectType ?
86
+ { subObjectType: params.subObjectType } :
87
+ {}),
88
+ index_text_chunk: params.index_text_chunk,
89
+ embedding: params.embedding,
90
+ only_one_index_text_chunk: params.only_one_index_text_chunk,
91
+ index_text_chunk_length: params.index_text_chunk_length,
92
+ index_text_chunk_position: params.index_text_chunk_position,
93
+ index_text_chunk_overlap: params.index_text_chunk_overlap,
94
+ indexerId: params.indexerId,
95
+ createTime: params.createTime,
96
+ updateTime: params.updateTime
97
+ };
98
+ }
@@ -0,0 +1 @@
1
+ {"version":3,"file":"indexSchema.js","sourceRoot":"","sources":["../src/indexSchema.ts"],"names":[],"mappings":"AAEA,MAAM,sBAAsB,GAAG,CAAC,CAAC;AAEjC,MAAM,UAAU,4BAA4B,CAAC,MAA8B;IACvE,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,qBAAqB,CAAC;IACtD,MAAM,oBAAoB,GAAG,WAAW,CAAC,oBAAoB,CAAC;IAE9D,IAAI,WAAW,CAAC,YAAY,KAAK,sBAAsB,EAAE,CAAC;QACtD,MAAM,IAAI,KAAK,CACX,oCAAoC,sBAAsB,SAAS,WAAW,CAAC,YAAY,EAAE,CAChG,CAAC;IACN,CAAC;IAED,IACI,WAAW,CAAC,oBAAoB,CAAC,gBAAgB;QACjD,WAAW,CAAC,oBAAoB,CAAC,OAAO,EAC1C,CAAC;QACC,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;IAC5E,CAAC;IAED,OAAO;QACH,SAAS,EAAE,WAAW,CAAC,aAAa;QACpC,QAAQ,EAAE;YACN,KAAK,EAAE;gBACH,gBAAgB,EAAE,WAAW,CAAC,cAAc;gBAC5C,kBAAkB,EAAE,WAAW,CAAC,gBAAgB;aACnD;YACD,WAAW,EAAE,IAAI;SACpB;QACD,QAAQ,EAAE;YACN,UAAU,EAAE;gBACR,QAAQ,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC7B,QAAQ,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC7B,QAAQ,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC7B,cAAc,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBACnC,UAAU,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC/B,WAAW,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAChC,aAAa,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAClC,gBAAgB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBACrC,SAAS,EAAE;oBACP,IAAI,EAAE,YAAY;oBAClB,SAAS,EAAE,oBAAoB,CAAC,SAAS;oBACzC,UAAU,EAAE,oBAAoB,CAAC,SAAS;oBAC1C,IAAI,EAAE,oBAAoB,CAAC,IAAI;oBAC/B,GAAG,CAAC,oBAAoB,CAAC,gBAAgB;wBACrC,CAAC,CAAC;4BACI,iBAAiB,EACb,oBAAoB,CAAC,gBAAgB;yBAC5C;wBACH,CAAC,CAAC,EAAE,CAAC;oBACT,MAAM,EAAE;wBACJ,IAAI,EAAE,MAAM;wBACZ,MAAM,EAAE,OAAO;wBACf,UAAU,EAAE;4BACR,CAAC,EAAE,oBAAoB,CAAC,CAAC;4BACzB,eAAe,EACX,oBAAoB,CAAC,cAAc;4BACvC,SAAS,EAAE,oBAAoB,CAAC,QAAQ;4BACxC,GAAG,CAAC,oBAAoB,CAAC,OAAO;gCAC5B,CAAC,CAAC;oCACI,OAAO,EAAE;wCACL,IAAI,EACA,oBAAoB,CAAC,OAAO,CAAC,IAAI;wCACrC,UAAU,EAAE;4CACR,IAAI,EACA,oBAAoB,CAAC,OAAO;iDACvB,IAAI;4CACb,IAAI,EACA,oBAAoB,CAAC,OAAO;iDACvB,IAAI;yCAChB;qCACJ;iCACJ;gCACH,CAAC,CAAC,EAAE,CAAC;yBACZ;qBACJ;iBACJ;gBACD,yBAAyB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC9C,uBAAuB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC5C,yBAAyB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC9C,wBAAwB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC7C,SAAS,EAAE,EAA
E,IAAI,EAAE,SAAS,EAAE;gBAC9B,UAAU,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,kBAAkB,EAAE;gBACxD,UAAU,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,kBAAkB,EAAE;aAC3D;SACJ;KACJ,CAAC;AACN,CAAC;AAuBD,MAAM,UAAU,0BAA0B,CACtC,MAA6B;IAE7B,OAAO;QACH,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACzD,GAAG,CAAC,MAAM,CAAC,cAAc;YACrB,CAAC,CAAC,EAAE,cAAc,EAAE,MAAM,CAAC,cAAc,EAAE;YAC3C,CAAC,CAAC,EAAE,CAAC;QACT,GAAG,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC/D,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAClE,GAAG,CAAC,MAAM,CAAC,aAAa;YACpB,CAAC,CAAC,EAAE,aAAa,EAAE,MAAM,CAAC,aAAa,EAAE;YACzC,CAAC,CAAC,EAAE,CAAC;QACT,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;QACzC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,yBAAyB,EAAE,MAAM,CAAC,yBAAyB;QAC3D,uBAAuB,EAAE,MAAM,CAAC,uBAAuB;QACvD,yBAAyB,EAAE,MAAM,CAAC,yBAAyB;QAC3D,wBAAwB,EAAE,MAAM,CAAC,wBAAwB;QACzD,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,UAAU,EAAE,MAAM,CAAC,UAAU;KAChC,CAAC;AACN,CAAC"}
@@ -0,0 +1,7 @@
1
+ import { onRecordFoundType } from "@magda/minion-framework/dist/MinionOptions.js";
2
+ import { Chunker } from "./chunker.js";
3
+ import EmbeddingApiClient from "@magda/typescript-common/dist/EmbeddingApiClient.js";
4
+ import OpensearchApiClient from "@magda/typescript-common/dist/OpensearchApiClient.js";
5
+ import SemanticIndexerOptions from "./semanticIndexerOptions.js";
6
+ import Registry from "@magda/typescript-common/dist/registry/AuthorizedRegistryClient.js";
7
/**
 * Creates the `onRecordFound` handler used when indexing registry records.
 * The returned handler builds embedding text for a record via
 * `options.createEmbeddingText` and passes it on for indexing.
 */
export declare const onRecordFoundRegistryRecord: (
    options: SemanticIndexerOptions,
    chunker: Chunker,
    embeddingApiClient: EmbeddingApiClient,
    opensearchApiClient: OpensearchApiClient,
    registryReadonlyClient: Registry
) => onRecordFoundType;
@@ -0,0 +1,47 @@
1
+ import { indexEmbeddingText } from "./indexEmbeddingText.js";
2
+ import { SkipError } from "./SkipError.js";
3
+ // The onRecordFound function passed to the minion SDK to handle registry records
4
/**
 * Creates the `onRecordFound` handler for registry records.
 *
 * The handler:
 *  1. ignores records that lack any of the aspects listed in `options.aspects`;
 *  2. asks the user-provided `options.createEmbeddingText` for the text;
 *  3. hands the result to `indexEmbeddingText`.
 *
 * A failure in step 2, or an empty result, is turned into a SkipError: the
 * record is skipped with a warning rather than failing the minion. Any other
 * error is rethrown.
 *
 * @param options Indexer options (`aspects`, `createEmbeddingText`, ...).
 * @param chunker Chunker forwarded to `indexEmbeddingText`.
 * @param embeddingApiClient Embedding client forwarded to `indexEmbeddingText`.
 * @param opensearchApiClient OpenSearch client forwarded to `indexEmbeddingText`.
 * @param registryReadonlyClient Registry client passed to `createEmbeddingText`.
 * @returns The async `(record, registry)` handler.
 */
export const onRecordFoundRegistryRecord = (options, chunker, embeddingApiClient, opensearchApiClient, registryReadonlyClient) => {
    return async (record, _registry) => {
        try {
            if (!record.aspects ||
                !options.aspects?.every((aspect) => aspect in record.aspects)) {
                return;
            }

            let embeddingText;
            try {
                embeddingText = await options.createEmbeddingText({
                    record,
                    format: null,
                    filePath: null,
                    url: null,
                    readonlyRegistry: registryReadonlyClient
                });
            }
            catch (err) {
                // The user function may throw anything, not only Error instances.
                const reason = err instanceof Error ? err.message : String(err);
                throw new SkipError(`Error in user-provided createEmbeddingText function: ${reason}`);
            }

            // Checked outside the try above so that an empty result is not
            // re-wrapped and reported as an error thrown by the user function.
            if (!embeddingText?.text && !embeddingText?.subObjects) {
                throw new SkipError("User-provided createEmbeddingText function returned no text or subObjects");
            }

            await indexEmbeddingText({
                options,
                chunker,
                embeddingApiClient,
                opensearchApiClient,
                embeddingText,
                metadata: {
                    recordId: record.id,
                    fileFormat: record.aspects["dataset-format"]?.format
                }
            });
        }
        catch (err) {
            if (err instanceof SkipError) {
                console.warn(`Skipping record ${record.id} because:`, err.message);
                return;
            }
            throw err;
        }
    };
};
@@ -0,0 +1 @@
1
+ {"version":3,"file":"onRecordFoundRegistryRecord.js","sourceRoot":"","sources":["../src/onRecordFoundRegistryRecord.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAE7D,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAG3C,oFAAoF;AACpF,MAAM,CAAC,MAAM,2BAA2B,GAAG,CACvC,OAA+B,EAC/B,OAAgB,EAChB,kBAAsC,EACtC,mBAAwC,EACxC,sBAAgC,EACf,EAAE;IACnB,OAAO,KAAK,EAAE,MAAc,EAAE,SAAS,EAAE,EAAE;QACvC,IAAI,CAAC;YACD,IACI,CAAC,MAAM,CAAC,OAAO;gBACf,CAAC,OAAO,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,IAAI,MAAM,CAAC,OAAO,CAAC,EAC/D,CAAC;gBACC,OAAO;YACX,CAAC;YAED,IAAI,aAAa,CAAC;YAClB,IAAI,CAAC;gBACD,aAAa,GAAG,MAAM,OAAO,CAAC,mBAAmB,CAAC;oBAC9C,MAAM;oBACN,MAAM,EAAE,IAAI;oBACZ,QAAQ,EAAE,IAAI;oBACd,GAAG,EAAE,IAAI;oBACT,gBAAgB,EAAE,sBAAsB;iBAC3C,CAAC,CAAC;gBAEH,IAAI,CAAC,aAAa,CAAC,IAAI,IAAI,CAAC,aAAa,CAAC,UAAU,EAAE,CAAC;oBACnD,MAAM,IAAI,SAAS,CACf,2EAA2E,CAC9E,CAAC;gBACN,CAAC;YACL,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACX,MAAM,IAAI,SAAS,CACf,wDACK,GAAa,CAAC,OACnB,EAAE,CACL,CAAC;YACN,CAAC;YAED,MAAM,kBAAkB,CAAC;gBACrB,OAAO;gBACP,OAAO;gBACP,kBAAkB;gBAClB,mBAAmB;gBACnB,aAAa;gBACb,QAAQ,EAAE;oBACN,QAAQ,EAAE,MAAM,CAAC,EAAE;oBACnB,UAAU,EAAE,MAAM,CAAC,OAAO,CAAC,gBAAgB,CAAC,EAAE,MAAM;iBACvD;aACJ,CAAC,CAAC;QACP,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,IAAI,GAAG,YAAY,SAAS,EAAE,CAAC;gBAC3B,OAAO,CAAC,IAAI,CACR,mBAAmB,MAAM,CAAC,EAAE,WAAW,EACvC,GAAG,CAAC,OAAO,CACd,CAAC;gBACF,OAAO;YACX,CAAC;YACD,MAAM,GAAG,CAAC;QACd,CAAC;IACL,CAAC,CAAC;AACN,CAAC,CAAC"}
@@ -0,0 +1,9 @@
1
+ import { onRecordFoundType } from "@magda/minion-framework/dist/MinionOptions.js";
2
+ import { Chunker } from "./chunker.js";
3
+ import EmbeddingApiClient from "@magda/typescript-common/dist/EmbeddingApiClient.js";
4
+ import OpensearchApiClient from "@magda/typescript-common/dist/OpensearchApiClient.js";
5
+ import SemanticIndexerOptions from "./semanticIndexerOptions.js";
6
+ import Registry from "@magda/typescript-common/dist/registry/AuthorizedRegistryClient.js";
7
+ import { MinioClient } from "./MinioClient.js";
8
/**
 * Creates the `onRecordFound` handler used when indexing storage objects
 * (distribution records that point at a downloadable file).
 */
export declare const onRecordFoundStorageObject: (
    options: SemanticIndexerOptions,
    chunker: Chunker,
    embeddingApiClient: EmbeddingApiClient,
    opensearchApiClient: OpensearchApiClient,
    minioClient: MinioClient,
    registryReadonlyClient: Registry
) => onRecordFoundType;

/**
 * Looks up the id of the record whose `dataset-distributions` aspect lists
 * the given distribution. Resolves to `null` when no such record is found or
 * the lookup fails.
 */
export declare function getParentRecordId(
    distributionId: string,
    registry: Registry
): Promise<string | null>;
@@ -0,0 +1,145 @@
1
+ import { indexEmbeddingText } from "./indexEmbeddingText.js";
2
+ import retry from "@magda/typescript-common/dist/retry.js";
3
+ import fetch from "node-fetch";
4
+ import * as fs from "fs";
5
+ import { tmpdir } from "os";
6
+ import { join } from "path";
7
+ import { v4 as uuidv4 } from "uuid";
8
+ import { SkipError } from "./SkipError.js";
9
+ import urijs from "urijs";
10
+ import { pipeline } from "stream/promises";
11
+ import ServerError from "@magda/typescript-common/dist/ServerError.js";
12
+ import { deleteTempFile } from "./helpers.js";
13
+ // The onRecordFound function passed to minion sdk to handle storage object records
14
/**
 * Creates the `onRecordFound` handler for storage objects (distributions).
 *
 * The handler:
 *  1. works out the file format (`dataset-format` aspect, then
 *     `dcat-distribution-strings`, then the URL suffix) and the download URL
 *     (`downloadURL`, falling back to `accessURL`);
 *  2. ignores distributions without a format or URL, or whose format matches
 *     none of `options.formatTypes` (case-insensitive substring match);
 *  3. downloads the file to a temp path unless `options.autoDownloadFile` is
 *     set to a falsy value other than `undefined`;
 *  4. asks `options.createEmbeddingText` for the text and always removes the
 *     temp file afterwards;
 *  5. hands the result to `indexEmbeddingText`.
 *
 * Download failures, failures of the user function and empty results become
 * a SkipError: the distribution is skipped with a warning. Any other error
 * is rethrown.
 *
 * @returns The async `(dist, registry)` handler.
 */
export const onRecordFoundStorageObject = (options, chunker, embeddingApiClient, opensearchApiClient, minioClient, registryReadonlyClient) => {
    // The second handler argument is unused; the read-only client is used instead.
    return async (dist, _registry) => {
        // The thrown value may be anything, not only an Error instance.
        const describe = (err) => (err instanceof Error ? err.message : String(err));
        try {
            const datasetFormat = dist.aspects?.["dataset-format"]?.format;
            const dcatDist = dist.aspects?.["dcat-distribution-strings"] || {};
            const { format: dcatFormat, downloadURL, accessURL } = dcatDist;
            const fileDownloadURL = downloadURL || accessURL;
            let format = datasetFormat || dcatFormat;
            if (!format && fileDownloadURL) {
                format = new urijs(fileDownloadURL).suffix().toUpperCase();
            }

            // filler record
            if (!format ||
                !fileDownloadURL ||
                !options.formatTypes?.some((f) => format.toLowerCase().includes(f.toLowerCase()))) {
                return;
            }

            let embeddingText;
            let filePath = null;
            const parentRecordId = await getParentRecordId(dist.id, registryReadonlyClient);
            try {
                try {
                    if (options.autoDownloadFile === undefined ||
                        options.autoDownloadFile) {
                        filePath = await downloadFileWithRetry(fileDownloadURL, minioClient);
                    }
                }
                catch (err) {
                    throw new SkipError(`Error in downloading file: ${describe(err)}`);
                }
                try {
                    embeddingText = await options.createEmbeddingText({
                        record: dist,
                        format: format,
                        filePath,
                        url: fileDownloadURL,
                        readonlyRegistry: registryReadonlyClient
                    });
                }
                catch (err) {
                    throw new SkipError(`Error in user-provided createEmbeddingText function: ${describe(err)}`);
                }
            }
            finally {
                // The temp file is only needed while the text is extracted.
                if (filePath) {
                    await deleteTempFile(filePath);
                }
            }

            // Checked outside the try above so that an empty result is not
            // re-wrapped and reported as an error thrown by the user function.
            if (!embeddingText?.text && !embeddingText?.subObjects) {
                throw new SkipError("User-provided createEmbeddingText function returned no text or subObjects");
            }

            await indexEmbeddingText({
                options,
                chunker,
                embeddingApiClient,
                opensearchApiClient,
                embeddingText,
                metadata: {
                    recordId: dist.id,
                    parentRecordId: parentRecordId,
                    aspectId: dist.aspects?.["dataset-format"]?.id,
                    fileFormat: format
                }
            });
        }
        catch (err) {
            if (err instanceof SkipError) {
                console.warn(`Skipping distribution ${dist.id} because:`, err.message);
                return;
            }
            throw err;
        }
    };
};
88
/**
 * Runs `downloadFile` through the shared `retry` helper.
 * NOTE(review): the numeric arguments (1, 5) are presumably the delay in
 * seconds and the retry count; confirm against the `retry` helper.
 * The per-retry callback intentionally does nothing.
 */
async function downloadFileWithRetry(url, minioClient) {
    const attempt = () => downloadFile(url, minioClient);
    const ignoreRetry = (_err, _retriesLeft) => { };
    return retry(attempt, 1, 5, ignoreRetry);
}
91
/**
 * Downloads the file behind `url`.
 *
 * `magda://storage-api/...` URLs are delegated to `minioClient.downloadFile`
 * and its result is returned as-is. Any other URL is fetched over the network
 * and streamed into a new file in the OS temp directory, whose path is
 * returned.
 *
 * @param url Location of the file.
 * @param minioClient Client used for `magda://storage-api` URLs.
 * @returns Path of the downloaded file (for storage-api URLs, whatever
 *          `minioClient.downloadFile` resolves to).
 * @throws SkipError on any download or write failure.
 */
async function downloadFile(url, minioClient) {
    const uri = urijs(url);
    if (uri.protocol() === "magda" && uri.hostname() === "storage-api") {
        try {
            return await minioClient.downloadFile(url);
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            throw new SkipError(`Failed to download file from Minio: ${reason}`);
        }
    }

    let response;
    try {
        response = await fetch(url);
    }
    catch (err) {
        throw new SkipError(`Failed to download file because network error`);
    }
    if (!response.ok) {
        throw new SkipError(`Failed to download file because HTTP error ${response.status}`);
    }
    if (!response.body) {
        throw new SkipError("No response body to write to file");
    }

    // Keep the URL's suffix as the file extension. A URL without a suffix
    // gets a bare uuid name instead of a name ending in a stray ".".
    const suffix = uri.suffix();
    const tempFileName = suffix ? `${uuidv4()}.${suffix}` : `${uuidv4()}`;
    const tempFilePath = join(tmpdir(), tempFileName);
    try {
        const writeStream = fs.createWriteStream(tempFilePath);
        await pipeline(response.body, writeStream);
        return tempFilePath;
    }
    catch (err) {
        // Do not leave a partially written file behind.
        await deleteTempFile(tempFilePath);
        throw new SkipError(`Failed to write file`);
    }
}
128
/**
 * Finds the id of the record whose `dataset-distributions.distributions`
 * array contains the given distribution id.
 *
 * This is a best-effort lookup: every failure is logged and reported as
 * `null` rather than thrown.
 *
 * @param distributionId Id of the distribution record.
 * @param registry Registry client used for the query.
 * @returns The id of the first matching record, or `null` when none is found
 *          or the lookup fails.
 */
export async function getParentRecordId(distributionId, registry) {
    try {
        // NOTE(review): encoding added on the understanding that the registry
        // URL-decodes the value part of an aspect query; confirm against the
        // registry API docs. Ids made of unreserved characters are unchanged.
        const aspectQuery = "dataset-distributions.distributions:<|" +
            encodeURIComponent(distributionId);
        const result = await registry.getRecords(["dataset-distributions"], undefined, undefined, true, undefined, [aspectQuery]);
        if (result instanceof ServerError) {
            console.error(`Failed to get parent record id: ${result.message}`);
            return null;
        }
        if (!("records" in result)) {
            console.error(`Failed to get parent record id`);
            return null;
        }
        return result.records[0]?.id || null;
    }
    catch (e) {
        console.error(`Unexpected error when getting parent record id`, e);
        return null;
    }
}
@@ -0,0 +1 @@
1
+ {"version":3,"file":"onRecordFoundStorageObject.js","sourceRoot":"","sources":["../src/onRecordFoundStorageObject.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAG7D,OAAO,KAAK,MAAM,sCAAsC,CAAC;AACzD,OAAO,KAAK,MAAM,YAAY,CAAC;AAC/B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAE,MAAM,EAAE,MAAM,IAAI,CAAC;AAC5B,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,EAAE,IAAI,MAAM,EAAE,MAAM,MAAM,CAAC;AACpC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,QAAQ,EAAE,MAAM,iBAAiB,CAAC;AAE3C,OAAO,WAAW,MAAM,4CAA4C,CAAC;AAErE,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,mFAAmF;AACnF,MAAM,CAAC,MAAM,0BAA0B,GAAG,CACtC,OAA+B,EAC/B,OAAgB,EAChB,kBAAsC,EACtC,mBAAwC,EACxC,WAAwB,EACxB,sBAAgC,EACf,EAAE;IACnB,OAAO,KAAK,EAAE,IAAY,EAAE,QAAQ,EAAE,EAAE;QACpC,IAAI,CAAC;YACD,MAAM,aAAa,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;YAC/D,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC,2BAA2B,CAAC,IAAI,EAAE,CAAC;YACnE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS,EAAE,GAAG,QAAQ,CAAC;YAChE,MAAM,eAAe,GAAG,WAAW,IAAI,SAAS,CAAC;YACjD,IAAI,MAAM,GAAG,aAAa,IAAI,UAAU,CAAC;YACzC,IAAI,CAAC,MAAM,IAAI,eAAe,EAAE,CAAC;gBAC7B,MAAM,GAAG,IAAI,KAAK,CAAC,eAAe,CAAC,CAAC,MAAM,EAAE,CAAC,WAAW,EAAE,CAAC;YAC/D,CAAC;YAED,gBAAgB;YAChB,IACI,CAAC,MAAM;gBACP,CAAC,eAAe;gBAChB,CAAC,OAAO,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAC7B,MAAM,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CACjD,EACH,CAAC;gBACC,OAAO;YACX,CAAC;YAED,IAAI,aAA4B,CAAC;YACjC,IAAI,QAAQ,GAAkB,IAAI,CAAC;YAEnC,MAAM,cAAc,GAAG,MAAM,iBAAiB,CAC1C,IAAI,CAAC,EAAE,EACP,sBAAsB,CACzB,CAAC;YACF,IAAI,CAAC;gBACD,IAAI,CAAC;oBACD,IACI,OAAO,CAAC,gBAAgB,KAAK,SAAS;wBACtC,OAAO,CAAC,gBAAgB,EAC1B,CAAC;wBACC,QAAQ,GAAG,MAAM,qBAAqB,CAClC,eAAe,EACf,WAAW,CACd,CAAC;oBACN,CAAC;gBACL,CAAC;gBAAC,OAAO,GAAG,EAAE,CAAC;oBACX,MAAM,IAAI,SAAS,CACf,8BAA+B,GAAa,CAAC,OAAO,EAAE,CACzD,CAAC;gBACN,CAAC;gBACD,IAAI,CAAC;oBACD,aAAa,GAAG,MAAM,OAAO,CAAC,mBAAmB,CAAC;wBAC9C,MAAM,EAAE,IAAI;wBACZ,MAAM,EAAE,MAAM;wBACd,QAAQ;wBACR,GAAG,EAAE,eAAe;wBAC
pB,gBAAgB,EAAE,sBAAsB;qBAC3C,CAAC,CAAC;oBAEH,IAAI,CAAC,aAAa,CAAC,IAAI,IAAI,CAAC,aAAa,CAAC,UAAU,EAAE,CAAC;wBACnD,MAAM,IAAI,SAAS,CACf,2EAA2E,CAC9E,CAAC;oBACN,CAAC;gBACL,CAAC;gBAAC,OAAO,GAAG,EAAE,CAAC;oBACX,MAAM,IAAI,SAAS,CACf,wDACK,GAAa,CAAC,OACnB,EAAE,CACL,CAAC;gBACN,CAAC;YACL,CAAC;oBAAS,CAAC;gBACP,IAAI,QAAQ,EAAE,CAAC;oBACX,MAAM,cAAc,CAAC,QAAQ,CAAC,CAAC;gBACnC,CAAC;YACL,CAAC;YAED,MAAM,kBAAkB,CAAC;gBACrB,OAAO;gBACP,OAAO;gBACP,kBAAkB;gBAClB,mBAAmB;gBACnB,aAAa;gBACb,QAAQ,EAAE;oBACN,QAAQ,EAAE,IAAI,CAAC,EAAE;oBACjB,cAAc,EAAE,cAAc;oBAC9B,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,EAAE,EAAE;oBAC5C,UAAU,EAAE,MAAM;iBACrB;aACJ,CAAC,CAAC;QACP,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,IAAI,GAAG,YAAY,SAAS,EAAE,CAAC;gBAC3B,OAAO,CAAC,IAAI,CACR,yBAAyB,IAAI,CAAC,EAAE,WAAW,EAC3C,GAAG,CAAC,OAAO,CACd,CAAC;gBACF,OAAO;YACX,CAAC;YACD,MAAM,GAAG,CAAC;QACd,CAAC;IACL,CAAC,CAAC;AACN,CAAC,CAAC;AAEF,KAAK,UAAU,qBAAqB,CAChC,GAAW,EACX,WAAwB;IAExB,OAAO,KAAK,CACR,GAAG,EAAE,CAAC,YAAY,CAAC,GAAG,EAAE,WAAW,CAAC,EACpC,CAAC,EACD,CAAC,EACD,CAAC,GAAG,EAAE,OAAO,EAAE,EAAE,GAAE,CAAC,CACvB,CAAC;AACN,CAAC;AAED,KAAK,UAAU,YAAY,CACvB,GAAW,EACX,WAAwB;IAExB,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;IACvB,IAAI,GAAG,CAAC,QAAQ,EAAE,KAAK,OAAO,IAAI,GAAG,CAAC,QAAQ,EAAE,KAAK,aAAa,EAAE,CAAC;QACjE,IAAI,CAAC;YACD,OAAO,MAAM,WAAW,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QAC/C,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,MAAM,IAAI,SAAS,CACf,uCAAwC,GAAa,CAAC,OAAO,EAAE,CAClE,CAAC;QACN,CAAC;IACL,CAAC;IAED,IAAI,QAAQ,CAAC;IACb,IAAI,CAAC;QACD,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACX,MAAM,IAAI,SAAS,CAAC,+CAA+C,CAAC,CAAC;IACzE,CAAC;IAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACf,MAAM,IAAI,SAAS,CACf,8CAA8C,QAAQ,CAAC,MAAM,EAAE,CAClE,CAAC;IACN,CAAC;IAED,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QACjB,MAAM,IAAI,SAAS,CAAC,mCAAmC,CAAC,CAAC;IAC7D,CAAC;IAED,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC;IACzB,MAAM,YAAY,GAAG,GAAG,MAAM,EAAE,EAAE,CAAC;IACnC,MAAM,MAAM,GAAG,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC;IACvC,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,YAAY,I
AAI,MAAM,EAAE,CAAC,CAAC;IAEhE,IAAI,CAAC;QACD,MAAM,WAAW,GAAG,EAAE,CAAC,iBAAiB,CAAC,YAAY,CAAC,CAAC;QACvD,MAAM,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;QAC3C,OAAO,YAAY,CAAC;IACxB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACX,MAAM,cAAc,CAAC,YAAY,CAAC,CAAC;QACnC,MAAM,IAAI,SAAS,CAAC,sBAAsB,CAAC,CAAC;IAChD,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACnC,cAAsB,EACtB,QAAkB;IAElB,IAAI,CAAC;QACD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,UAAU,CACpC,CAAC,uBAAuB,CAAC,EACzB,SAAS,EACT,SAAS,EACT,IAAI,EACJ,SAAS,EACT,CAAC,wCAAwC,GAAG,cAAc,CAAC,CAC9D,CAAC;QAEF,IAAI,MAAM,YAAY,WAAW,EAAE,CAAC;YAChC,OAAO,CAAC,KAAK,CAAC,mCAAmC,MAAM,CAAC,OAAO,EAAE,CAAC,CAAC;YACnE,OAAO,IAAI,CAAC;QAChB,CAAC;QACD,IAAI,CAAC,CAAC,SAAS,IAAI,MAAM,CAAC,EAAE,CAAC;YACzB,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;YAChD,OAAO,IAAI,CAAC;QAChB,CAAC;QAED,OAAO,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,IAAI,IAAI,CAAC;IACzC,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACT,OAAO,CAAC,KAAK,CAAC,gDAAgD,EAAE,CAAC,CAAC,CAAC;QACnE,OAAO,IAAI,CAAC;IAChB,CAAC;AACL,CAAC"}
@@ -0,0 +1,2 @@
1
+ import SemanticIndexerOptions from "./semanticIndexerOptions.js";
2
/**
 * Entry point of the framework: validates `userConfig`, prepares the API
 * clients and the index, then starts the minion for the configured item type.
 */
export default function semanticIndexer(
    userConfig: SemanticIndexerOptions
): Promise<void>;
@@ -0,0 +1,86 @@
1
+ import minion from "@magda/minion-framework/dist/index.js";
2
+ import { Chunker, UserDefinedChunkStrategy, RecursiveChunkStrategy } from "./chunker.js";
3
+ import { onRecordFoundRegistryRecord } from "./onRecordFoundRegistryRecord.js";
4
+ import { onRecordFoundStorageObject } from "./onRecordFoundStorageObject.js";
5
+ import { createSemanticIndexerMapping } from "./indexSchema.js";
6
+ import EmbeddingApiClient from "@magda/typescript-common/dist/EmbeddingApiClient.js";
7
+ import OpensearchApiClient from "@magda/typescript-common/dist/OpensearchApiClient.js";
8
+ import { validateSemanticIndexerOptions } from "./semanticIndexerOptions.js";
9
+ import retry from "@magda/typescript-common/dist/retry.js";
10
+ import { MinioClient } from "./MinioClient.js";
11
+ import { MAGDA_SYSTEM_ID } from "@magda/typescript-common/dist/registry/TenantConsts.js";
12
+ import Registry from "@magda/typescript-common/dist/registry/AuthorizedRegistryClient.js";
13
+ // Main function for semantic indexer
14
/**
 * Main function for the semantic indexer.
 *
 * Steps, in order: validate the options, create the OpenSearch, embedding
 * and read-only registry clients, create the index if it does not exist,
 * build the chunker, then start the minion with the handler matching
 * `userConfig.itemType`.
 *
 * Any initialization error, and any error from the minion run, is logged and
 * terminates the process with exit code 1. The minion promise is not
 * awaited, so this function resolves once the minion has been started.
 *
 * @param userConfig Indexer options supplied by the concrete indexer.
 */
export default async function semanticIndexer(userConfig) {
    try {
        validateSemanticIndexerOptions(userConfig);
        const { argv } = userConfig;
        const semanticIndexerConfig = argv.semanticIndexerConfig;

        const connectOpensearch = () => OpensearchApiClient.getOpensearchApiClient({
            url: argv.opensearchApiURL
        });
        const opensearchApiClient = await retry(connectOpensearch, 5, 5, (e, left) => console.error(`Opensearch connection failed, remaining retries: ${left}, error:`, e.message));

        const createEmbeddingClient = () => Promise.resolve(new EmbeddingApiClient({
            baseApiUrl: argv.embeddingApiURL
        }));
        const embeddingApiClient = await retry(createEmbeddingClient, 5, 5, (e, left) => console.error(`Embedding API connection failed, remaining retries: ${left}, error:`, e.message));

        const registryReadonlyClient = new Registry({
            baseUrl: argv.registryReadonlyURL,
            jwtSecret: argv.jwtSecret,
            userId: argv.userId,
            maxRetries: 3,
            tenantId: MAGDA_SYSTEM_ID
        });

        // Create the index on first run only.
        const indexAlreadyExists = await opensearchApiClient.indexExists(semanticIndexerConfig.fullIndexName);
        if (!indexAlreadyExists) {
            await opensearchApiClient.createIndex(createSemanticIndexerMapping(userConfig));
        }

        // A user-defined strategy wins; otherwise use recursive chunking, with
        // values from argv taking precedence over the ones in userConfig.
        let chunkStrategy;
        if (userConfig.chunkStrategy) {
            chunkStrategy = new UserDefinedChunkStrategy(userConfig.chunkStrategy);
        }
        else {
            chunkStrategy = new RecursiveChunkStrategy(semanticIndexerConfig.chunkSizeLimit || userConfig.chunkSizeLimit, semanticIndexerConfig.overlap || userConfig.overlap);
        }
        const chunker = new Chunker(chunkStrategy);

        // Options common to both item types.
        const sharedMinionOptions = {
            argv,
            id: userConfig.id,
            writeAspectDefs: [],
            async: true,
            includeEvents: false,
            includeRecords: true
        };

        let minionOptions;
        switch (userConfig.itemType) {
            case "registryRecord": {
                minionOptions = {
                    ...sharedMinionOptions,
                    aspects: userConfig.aspects || [],
                    optionalAspects: userConfig.optionalAspects || [],
                    dereference: true,
                    onRecordFound: onRecordFoundRegistryRecord(userConfig, chunker, embeddingApiClient, opensearchApiClient, registryReadonlyClient)
                };
                break;
            }
            case "storageObject": {
                const minioClient = new MinioClient(argv.minioConfig, argv.minioAccessKey, argv.minioSecretKey);
                minionOptions = {
                    ...sharedMinionOptions,
                    aspects: ["dcat-distribution-strings", "dataset-format"],
                    optionalAspects: [],
                    dereference: false,
                    onRecordFound: onRecordFoundStorageObject(userConfig, chunker, embeddingApiClient, opensearchApiClient, minioClient, registryReadonlyClient),
                    maxRetries: 3
                };
                break;
            }
            default:
                throw new Error("Invalid itemType");
        }

        // Not awaited: failures of the running minion are handled here.
        minion(minionOptions).catch((e) => {
            console.error("Minion execution error: " + e.message, e);
            process.exit(1);
        });
    }
    catch (e) {
        console.error("semanticIndexer initialization error: " + e.message, e);
        process.exit(1);
    }
}
@@ -0,0 +1 @@
1
+ {"version":3,"file":"semanticIndexer.js","sourceRoot":"","sources":["../src/semanticIndexer.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,qCAAqC,CAAC;AACzD,OAAO,EACH,OAAO,EACP,wBAAwB,EACxB,sBAAsB,EACzB,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,2BAA2B,EAAE,MAAM,kCAAkC,CAAC;AAC/E,OAAO,EAAE,0BAA0B,EAAE,MAAM,iCAAiC,CAAC;AAC7E,OAAO,EAAE,4BAA4B,EAAE,MAAM,kBAAkB,CAAC;AAChE,OAAO,kBAAkB,MAAM,mDAAmD,CAAC;AACnF,OAAO,mBAAmB,MAAM,oDAAoD,CAAC;AACrF,OAA+B,EAC3B,8BAA8B,EACjC,MAAM,6BAA6B,CAAC;AAIrC,OAAO,KAAK,MAAM,sCAAsC,CAAC;AACzD,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,eAAe,EAAE,MAAM,sDAAsD,CAAC;AACvF,OAAO,QAAQ,MAAM,kEAAkE,CAAC;AAExF,qCAAqC;AACrC,MAAM,CAAC,OAAO,CAAC,KAAK,UAAU,eAAe,CACzC,UAAkC;IAElC,IAAI,CAAC;QACD,8BAA8B,CAAC,UAAU,CAAC,CAAC;QAC3C,MAAM,qBAAqB,GAAG,UAAU,CAAC,IAAI,CAAC,qBAAqB,CAAC;QACpE,MAAM,mBAAmB,GAAG,MAAM,KAAK,CACnC,GAAG,EAAE,CACD,mBAAmB,CAAC,sBAAsB,CAAC;YACvC,GAAG,EAAE,UAAU,CAAC,IAAI,CAAC,gBAAgB;SACxC,CAAC,EACN,CAAC,EACD,CAAC,EACD,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CACR,OAAO,CAAC,KAAK,CACT,oDAAoD,IAAI,UAAU,EAClE,CAAC,CAAC,OAAO,CACZ,CACR,CAAC;QAEF,MAAM,kBAAkB,GAAG,MAAM,KAAK,CAClC,GAAG,EAAE,CACD,OAAO,CAAC,OAAO,CACX,IAAI,kBAAkB,CAAC;YACnB,UAAU,EAAE,UAAU,CAAC,IAAI,CAAC,eAAe;SAC9C,CAAC,CACL,EACL,CAAC,EACD,CAAC,EACD,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CACR,OAAO,CAAC,KAAK,CACT,uDAAuD,IAAI,UAAU,EACrE,CAAC,CAAC,OAAO,CACZ,CACR,CAAC;QAEF,MAAM,sBAAsB,GAAG,IAAI,QAAQ,CAAC;YACxC,OAAO,EAAE,UAAU,CAAC,IAAI,CAAC,mBAAmB;YAC5C,SAAS,EAAE,UAAU,CAAC,IAAI,CAAC,SAAS;YACpC,MAAM,EAAE,UAAU,CAAC,IAAI,CAAC,MAAM;YAC9B,UAAU,EAAE,CAAC;YACb,QAAQ,EAAE,eAAe;SAC5B,CAAC,CAAC;QAEH,IACI,CAAC,CAAC,MAAM,mBAAmB,CAAC,WAAW,CACnC,qBAAqB,CAAC,aAAa,CACtC,CAAC,EACJ,CAAC;YACC,MAAM,eAAe,GAAG,4BAA4B,CAAC,UAAU,CAAC,CAAC;YACjE,MAAM,mBAAmB,CAAC,WAAW,CAAC,eAAe,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,aAAa,GAAG,UAAU,CAAC,aAAa;YAC1C,CAAC,CAAC,IAAI,wBAAwB,CAAC,UAAU,CAAC,aAAa,CAAC;YACxD,CAAC,CAAC,IAAI,sBAAsB,CACtB,qBAAqB,CAAC,cAAc;gBAChC,UAAU,CAAC,cAAc,EAC7B,qBAAqB,CAAC,OAAO,IAAI,UAAU,CAAC,OAAO,CACtD,CAAC;QAER,MAAM,OAAO,GAAG,IAAI,OAAO,CAAC
,aAAa,CAAC,CAAC;QAE3C,IAAI,aAAgC,CAAC;QACrC,IAAI,aAA4B,CAAC;QAEjC,IAAI,UAAU,CAAC,QAAQ,KAAK,gBAAgB,EAAE,CAAC;YAC3C,aAAa,GAAG,2BAA2B,CACvC,UAAU,EACV,OAAO,EACP,kBAAkB,EAClB,mBAAmB,EACnB,sBAAsB,CACzB,CAAC;YACF,aAAa,GAAG;gBACZ,IAAI,EAAE,UAAU,CAAC,IAAI;gBACrB,EAAE,EAAE,UAAU,CAAC,EAAE;gBACjB,OAAO,EAAE,UAAU,CAAC,OAAO,IAAI,EAAE;gBACjC,eAAe,EAAE,UAAU,CAAC,eAAe,IAAI,EAAE;gBACjD,eAAe,EAAE,EAAE;gBACnB,KAAK,EAAE,IAAI;gBACX,WAAW,EAAE,IAAI;gBACjB,aAAa,EAAE,KAAK;gBACpB,cAAc,EAAE,IAAI;gBACpB,aAAa,EAAE,aAAa;aAC/B,CAAC;QACN,CAAC;aAAM,IAAI,UAAU,CAAC,QAAQ,KAAK,eAAe,EAAE,CAAC;YACjD,MAAM,WAAW,GAAG,IAAI,WAAW,CAC/B,UAAU,CAAC,IAAI,CAAC,WAAW,EAC3B,UAAU,CAAC,IAAI,CAAC,cAAc,EAC9B,UAAU,CAAC,IAAI,CAAC,cAAc,CACjC,CAAC;YACF,aAAa,GAAG,0BAA0B,CACtC,UAAU,EACV,OAAO,EACP,kBAAkB,EAClB,mBAAmB,EACnB,WAAW,EACX,sBAAsB,CACzB,CAAC;YACF,aAAa,GAAG;gBACZ,IAAI,EAAE,UAAU,CAAC,IAAI;gBACrB,EAAE,EAAE,UAAU,CAAC,EAAE;gBACjB,OAAO,EAAE,CAAC,2BAA2B,EAAE,gBAAgB,CAAC;gBACxD,eAAe,EAAE,EAAE;gBACnB,eAAe,EAAE,EAAE;gBACnB,KAAK,EAAE,IAAI;gBACX,WAAW,EAAE,KAAK;gBAClB,aAAa,EAAE,KAAK;gBACpB,cAAc,EAAE,IAAI;gBACpB,aAAa,EAAE,aAAa;gBAC5B,UAAU,EAAE,CAAC;aAChB,CAAC;QACN,CAAC;aAAM,CAAC;YACJ,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,CAAC,aAAa,CAAC,CAAC,KAAK,CAAC,CAAC,CAAQ,EAAE,EAAE;YACrC,OAAO,CAAC,KAAK,CAAC,0BAA0B,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC,CAAC,CAAC;IACP,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACT,OAAO,CAAC,KAAK,CACT,wCAAwC,GAAI,CAAW,CAAC,OAAO,EAC/D,CAAC,CACJ,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;AACL,CAAC"}
@@ -0,0 +1,19 @@
1
+ import { ItemType } from "./indexSchema.js";
2
+ import { CreateEmbeddingText } from "./createEmbeddingText.js";
3
+ import { SemanticIndexerArguments } from "./commonYargs.js";
4
+ import { ChunkStrategyType } from "./chunker.js";
5
/**
 * Configuration object checked by `validateSemanticIndexerOptions`.
 *
 * Only the constraints enforced by that validator are documented as facts
 * here; the meaning of the remaining fields is not established by this
 * declaration file.
 */
export default interface SemanticIndexerOptions {
    /** Parsed command-line arguments (see `commonYargs`). */
    argv: SemanticIndexerArguments;
    id: string;
    /** Selects which of `aspects` / `formatTypes` the validator requires. */
    itemType: ItemType;
    /** Must be a non-empty array when `itemType` is `"registryRecord"`. */
    aspects?: Array<string>;
    optionalAspects?: Array<string>;
    /** Must be a non-empty array when `itemType` is `"storageObject"`. */
    formatTypes?: Array<string>;
    createEmbeddingText: CreateEmbeddingText;
    chunkStrategy?: ChunkStrategyType;
    /** When provided, must be a positive integer. */
    chunkSizeLimit?: number;
    /**
     * When provided, must be a non-negative integer. If both this and
     * `chunkSizeLimit` are set (and non-zero), it must be less than half of
     * `chunkSizeLimit`.
     */
    overlap?: number;
    autoDownloadFile?: boolean;
    // NOTE(review): format of this string is not visible here — confirm.
    timeout?: string;
}
19
/**
 * Validates a {@link SemanticIndexerOptions} object.
 *
 * @param options - The options to check.
 * @throws Error when `chunkSizeLimit` is not a positive integer, when
 * `overlap` is not a non-negative integer, when `overlap` is not less than
 * half of `chunkSizeLimit`, or when the array required by `itemType`
 * (`aspects` / `formatTypes`) is missing or empty.
 */
export declare function validateSemanticIndexerOptions(
    options: SemanticIndexerOptions
): void;
@@ -0,0 +1,26 @@
1
+ export function validateSemanticIndexerOptions(options) {
2
+ if (options.chunkSizeLimit !== undefined && (
3
+ !Number.isInteger(options.chunkSizeLimit) ||
4
+ options.chunkSizeLimit <= 0)) {
5
+ throw new Error("'chunkSizeLimit' must be a positive integer");
6
+ }
7
+ if (options.overlap !== undefined && (
8
+ !Number.isInteger(options.overlap) || options.overlap < 0)) {
9
+ throw new Error("'overlap' must be a non-negative integer");
10
+ }
11
+ if (options.chunkSizeLimit &&
12
+ options.overlap &&
13
+ options.chunkSizeLimit <= options.overlap * 2) {
14
+ throw new Error("'overlap' must be less than half of 'chunkSizeLimit'");
15
+ }
16
+ if (options.itemType === "registryRecord") {
17
+ if (!options.aspects || options.aspects.length === 0) {
18
+ throw new Error("'aspects' is required");
19
+ }
20
+ }
21
+ if (options.itemType === "storageObject") {
22
+ if (!options.formatTypes || options.formatTypes.length === 0) {
23
+ throw new Error("'formatTypes' is required");
24
+ }
25
+ }
26
+ }
@@ -0,0 +1 @@
1
+ {"version":3,"file":"semanticIndexerOptions.js","sourceRoot":"","sources":["../src/semanticIndexerOptions.ts"],"names":[],"mappings":"AAqBA,MAAM,UAAU,8BAA8B,CAC1C,OAA+B;IAE/B,IACI,OAAO,CAAC,cAAc,KAAK,SAAS;QACpC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,OAAO,CAAC,cAAc,CAAC;YACtC,OAAO,CAAC,cAAc,IAAI,CAAC,CAAC,EAClC,CAAC;QACC,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACnE,CAAC;IACD,IACI,OAAO,CAAC,OAAO,KAAK,SAAS;QAC7B,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,OAAO,GAAG,CAAC,CAAC,EAC7D,CAAC;QACC,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;IAChE,CAAC;IACD,IACI,OAAO,CAAC,cAAc;QACtB,OAAO,CAAC,OAAO;QACf,OAAO,CAAC,cAAc,IAAI,OAAO,CAAC,OAAO,GAAG,CAAC,EAC/C,CAAC;QACC,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;IAC5E,CAAC;IACD,IAAI,OAAO,CAAC,QAAQ,KAAK,gBAAgB,EAAE,CAAC;QACxC,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnD,MAAM,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC;QAC7C,CAAC;IACL,CAAC;IACD,IAAI,OAAO,CAAC,QAAQ,KAAK,eAAe,EAAE,CAAC;QACvC,IAAI,CAAC,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3D,MAAM,IAAI,KAAK,CAAC,2BAA2B,CAAC,CAAC;QACjD,CAAC;IACL,CAAC;AACL,CAAC"}
@@ -0,0 +1,41 @@
1
+ import { SinonStub, SinonFakeTimers } from "sinon";
2
+ import SemanticIndexerOptions from "../semanticIndexerOptions.js";
3
/**
 * Ambient declaration of a test helper class for semantic indexer tests.
 *
 * NOTE(review): only the type surface is visible here; the behavior of each
 * member lives in the implementation file and should be confirmed there.
 */
export declare class BaseSemanticIndexerTest {
    // Collaborators, all typed as `any` in this declaration.
    chunker: any;
    embeddingApiClient: any;
    opensearchApiClient: any;
    minioClient: any;
    registry: any;

    // Sinon stubs and fake timers exposed to tests.
    createEmbeddingTextStub: SinonStub;
    consoleLogStub: SinonStub;
    consoleWarnStub: SinonStub;
    consoleErrorStub: SinonStub;
    clock: SinonFakeTimers;

    // Default fixture values.
    DEFAULT_FAKE_TIME: Date;
    DEFAULT_PARENT_RECORD_ID: string;
    DEFAULT_CREATE_EMBEDDING_TEXT_RESULT: { text: string };

    userConfig: any;

    /**
     * @param __0 - Optional settings bag; every property is optional.
     */
    constructor({
        createEmbeddingTextResult,
        fakeTime,
        overridesConfig,
        suppressConsoleLogs
    }?: {
        createEmbeddingTextResult?: any;
        fakeTime?: Date;
        overridesConfig?: Partial<SemanticIndexerOptions>;
        suppressConsoleLogs?: boolean;
    });

    cleanup(): void;

    updateUserConfig(overrides?: Partial<SemanticIndexerOptions>): any;

    getCurrentTimeString(): string;

    expectCalledWith(
        stub: SinonStub,
        callIndex: number,
        ...expectedArgs: any[]
    ): void;

    /** Each count is optional; defaults are not visible in this declaration. */
    expectSuccessCalls(options?: {
        createEmbeddingTextCallCount?: number;
        chunkCallCount?: number;
        embeddingApiCallCount?: number;
        bulkIndexCallCount?: number;
        deleteByQueryCallCount?: number;
    }): void;

    expectIndexedDoc(expectedDoc: any, callIndex?: number): void;

    expectIndexedDocs(expectedDocs: any[], callIndex?: number): void;

    getIndexedDocs(callIndex?: number): any[];
}
41
/**
 * Returns a complete {@link SemanticIndexerOptions} object for tests.
 *
 * @param overrideConfig - Optional partial options.
 * NOTE(review): presumably merged over built-in fake defaults — confirm in
 * the implementation, which is not visible in this declaration.
 */
export declare function createFakeSemanticIndexerConfig(
    overrideConfig?: Partial<SemanticIndexerOptions>
): SemanticIndexerOptions;