@magda/semantic-indexer-framework 5.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/MinioClient.d.ts +7 -0
- package/dist/MinioClient.js +37 -0
- package/dist/MinioClient.js.map +1 -0
- package/dist/SkipError.d.ts +1 -0
- package/dist/SkipError.js +1 -0
- package/dist/SkipError.js.map +1 -0
- package/dist/chunker.d.ts +32 -0
- package/dist/chunker.js +84 -0
- package/dist/chunker.js.map +1 -0
- package/dist/commonYargs.d.ts +43 -0
- package/dist/commonYargs.js +61 -0
- package/dist/commonYargs.js.map +1 -0
- package/dist/createEmbeddingText.d.ts +18 -0
- package/dist/createEmbeddingText.js +1 -0
- package/dist/createEmbeddingText.js.map +1 -0
- package/dist/helpers.d.ts +1 -0
- package/dist/helpers.js +10 -0
- package/dist/helpers.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.js +4 -0
- package/dist/index.js.map +1 -0
- package/dist/indexEmbeddingText.d.ts +22 -0
- package/dist/indexEmbeddingText.js +216 -0
- package/dist/indexEmbeddingText.js.map +1 -0
- package/dist/indexSchema.d.ts +105 -0
- package/dist/indexSchema.js +98 -0
- package/dist/indexSchema.js.map +1 -0
- package/dist/onRecordFoundRegistryRecord.d.ts +7 -0
- package/dist/onRecordFoundRegistryRecord.js +47 -0
- package/dist/onRecordFoundRegistryRecord.js.map +1 -0
- package/dist/onRecordFoundStorageObject.d.ts +9 -0
- package/dist/onRecordFoundStorageObject.js +145 -0
- package/dist/onRecordFoundStorageObject.js.map +1 -0
- package/dist/semanticIndexer.d.ts +2 -0
- package/dist/semanticIndexer.js +86 -0
- package/dist/semanticIndexer.js.map +1 -0
- package/dist/semanticIndexerOptions.d.ts +19 -0
- package/dist/semanticIndexerOptions.js +26 -0
- package/dist/semanticIndexerOptions.js.map +1 -0
- package/dist/test/BaseSemanticIndexerTest.d.ts +41 -0
- package/dist/test/BaseSemanticIndexerTest.js +167 -0
- package/dist/test/BaseSemanticIndexerTest.js.map +1 -0
- package/dist/test/chunker.spec.d.ts +1 -0
- package/dist/test/chunker.spec.js +154 -0
- package/dist/test/chunker.spec.js.map +1 -0
- package/dist/test/embeddingApiClient.spec.d.ts +1 -0
- package/dist/test/embeddingApiClient.spec.js +43 -0
- package/dist/test/embeddingApiClient.spec.js.map +1 -0
- package/dist/test/helpers.d.ts +4 -0
- package/dist/test/helpers.js +34 -0
- package/dist/test/helpers.js.map +1 -0
- package/dist/test/indexEmbeddingText.spec.d.ts +1 -0
- package/dist/test/indexEmbeddingText.spec.js +238 -0
- package/dist/test/indexEmbeddingText.spec.js.map +1 -0
- package/dist/test/mockEmbeddingApi.d.ts +1 -0
- package/dist/test/mockEmbeddingApi.js +25 -0
- package/dist/test/mockEmbeddingApi.js.map +1 -0
- package/dist/test/onRecordFoundRegistryRecord.spec.d.ts +1 -0
- package/dist/test/onRecordFoundRegistryRecord.spec.js +155 -0
- package/dist/test/onRecordFoundRegistryRecord.spec.js.map +1 -0
- package/dist/test/onRecordFoundStorageObject.spec.d.ts +1 -0
- package/dist/test/onRecordFoundStorageObject.spec.js +490 -0
- package/dist/test/onRecordFoundStorageObject.spec.js.map +1 -0
- package/package.json +78 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { SemanticIndexerOptions } from "./index.js";
|
|
2
|
+
/**
 * Builds the index definition (name, settings and field mappings) for the
 * semantic index from `config.argv.semanticIndexerConfig`.
 *
 * The implementation throws an `Error` when the configured `indexVersion`
 * is not the version it supports, or when both `compressionLevel` and
 * `encoder` are set on the k-NN vector field configuration.
 *
 * @param config Semantic indexer options; only `argv.semanticIndexerConfig` is read.
 * @returns The index name together with its `settings` and `mappings`.
 */
export declare function createSemanticIndexerMapping(config: SemanticIndexerOptions): {
    indexName: string;
    settings: {
        index: { number_of_shards: number; number_of_replicas: number; };
        "index.knn": boolean;
    };
    mappings: {
        properties: {
            itemType: { type: string; };
            recordId: { type: string; };
            aspectId: { type: string; };
            parentRecordId: { type: string; };
            fileFormat: { type: string; };
            subObjectId: { type: string; };
            subObjectType: { type: string; };
            index_text_chunk: { type: string; };
            embedding: {
                method: {
                    name: string;
                    engine: string;
                    parameters: {
                        // Present only when an encoder is configured.
                        encoder?: {
                            name: string;
                            parameters: { type: string; clip: boolean; };
                        };
                        m: number;
                        ef_construction: number;
                        ef_search: number;
                    };
                };
                // Present only when a compression level is configured.
                compression_level?: string;
                type: string;
                dimension: number;
                space_type: string;
                mode: string;
            };
            only_one_index_text_chunk: { type: string; };
            index_text_chunk_length: { type: string; };
            index_text_chunk_position: { type: string; };
            index_text_chunk_overlap: { type: string; };
            indexerId: { type: string; };
            createTime: { type: string; format: string; };
            updateTime: { type: string; format: string; };
        };
    };
};
|
|
86
|
+
/** The two kinds of item a semantic index document can describe. */
export type ItemType = "registryRecord" | "storageObject";
|
|
87
|
+
/**
 * Shape of a single document stored in the semantic index.
 *
 * Field names mirror the `mappings.properties` produced by
 * `createSemanticIndexerMapping`; optional members are left out of the
 * built document by `buildSemanticIndexDocument` when they are falsy.
 */
export interface SemanticIndexDocument {
    /** Which kind of item this document was produced from. */
    itemType: ItemType;
    /** Mapped as a `keyword` field. */
    recordId: string;
    /** Optional; mapped as a `keyword` field. */
    aspectId?: string;
    /** Optional; mapped as a `keyword` field. */
    parentRecordId?: string;
    /** Optional; mapped as a `keyword` field. */
    fileFormat?: string;
    /** Optional; mapped as a `keyword` field. */
    subObjectId?: string;
    /** Optional; mapped as a `keyword` field. */
    subObjectType?: string;
    /** The text chunk itself; mapped as a `keyword` field. */
    index_text_chunk: string;
    /** Vector stored in the `knn_vector` field. */
    embedding: number[];
    /** Mapped as a `boolean` field. */
    only_one_index_text_chunk: boolean;
    /** Mapped as an `integer` field. */
    index_text_chunk_length: number;
    /** Mapped as an `integer` field. */
    index_text_chunk_position: number;
    /** Mapped as an `integer` field. */
    index_text_chunk_overlap: number;
    /** Mapped as a `keyword` field. */
    indexerId: string;
    /** Mapped as a `date` field with format `strict_date_time`. */
    createTime: string;
    /** Mapped as a `date` field with format `strict_date_time`. */
    updateTime: string;
}
|
|
105
|
+
/**
 * Returns a new `SemanticIndexDocument` copied field by field from `params`.
 * The optional fields (`aspectId`, `parentRecordId`, `fileFormat`,
 * `subObjectId`, `subObjectType`) are only included when their value is truthy.
 */
export declare function buildSemanticIndexDocument(params: SemanticIndexDocument): SemanticIndexDocument;
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
const SEMANTIC_INDEX_VERSION = 1;
|
|
2
|
+
export function createSemanticIndexerMapping(config) {
|
|
3
|
+
const indexConfig = config.argv.semanticIndexerConfig;
|
|
4
|
+
const knnVectorFieldConfig = indexConfig.knnVectorFieldConfig;
|
|
5
|
+
if (indexConfig.indexVersion !== SEMANTIC_INDEX_VERSION) {
|
|
6
|
+
throw new Error(`Index version mismatch. Expected ${SEMANTIC_INDEX_VERSION}, got ${indexConfig.indexVersion}`);
|
|
7
|
+
}
|
|
8
|
+
if (indexConfig.knnVectorFieldConfig.compressionLevel &&
|
|
9
|
+
indexConfig.knnVectorFieldConfig.encoder) {
|
|
10
|
+
throw new Error("compressionLevel and encoder cannot be used together");
|
|
11
|
+
}
|
|
12
|
+
return {
|
|
13
|
+
indexName: indexConfig.fullIndexName,
|
|
14
|
+
settings: {
|
|
15
|
+
index: {
|
|
16
|
+
number_of_shards: indexConfig.numberOfShards,
|
|
17
|
+
number_of_replicas: indexConfig.numberOfReplicas
|
|
18
|
+
},
|
|
19
|
+
"index.knn": true
|
|
20
|
+
},
|
|
21
|
+
mappings: {
|
|
22
|
+
properties: {
|
|
23
|
+
itemType: { type: "keyword" },
|
|
24
|
+
recordId: { type: "keyword" },
|
|
25
|
+
aspectId: { type: "keyword" },
|
|
26
|
+
parentRecordId: { type: "keyword" },
|
|
27
|
+
fileFormat: { type: "keyword" },
|
|
28
|
+
subObjectId: { type: "keyword" },
|
|
29
|
+
subObjectType: { type: "keyword" },
|
|
30
|
+
index_text_chunk: { type: "keyword" },
|
|
31
|
+
embedding: {
|
|
32
|
+
type: "knn_vector",
|
|
33
|
+
dimension: knnVectorFieldConfig.dimension,
|
|
34
|
+
space_type: knnVectorFieldConfig.spaceType,
|
|
35
|
+
mode: knnVectorFieldConfig.mode,
|
|
36
|
+
...(knnVectorFieldConfig.compressionLevel ?
|
|
37
|
+
{
|
|
38
|
+
compression_level: knnVectorFieldConfig.compressionLevel
|
|
39
|
+
} :
|
|
40
|
+
{}),
|
|
41
|
+
method: {
|
|
42
|
+
name: "hnsw",
|
|
43
|
+
engine: "faiss",
|
|
44
|
+
parameters: {
|
|
45
|
+
m: knnVectorFieldConfig.m,
|
|
46
|
+
ef_construction: knnVectorFieldConfig.efConstruction,
|
|
47
|
+
ef_search: knnVectorFieldConfig.efSearch,
|
|
48
|
+
...(knnVectorFieldConfig.encoder ?
|
|
49
|
+
{
|
|
50
|
+
encoder: {
|
|
51
|
+
name: knnVectorFieldConfig.encoder.name,
|
|
52
|
+
parameters: {
|
|
53
|
+
type: knnVectorFieldConfig.encoder.
|
|
54
|
+
type,
|
|
55
|
+
clip: knnVectorFieldConfig.encoder.
|
|
56
|
+
clip
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
} :
|
|
60
|
+
{})
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
},
|
|
64
|
+
only_one_index_text_chunk: { type: "boolean" },
|
|
65
|
+
index_text_chunk_length: { type: "integer" },
|
|
66
|
+
index_text_chunk_position: { type: "integer" },
|
|
67
|
+
index_text_chunk_overlap: { type: "integer" },
|
|
68
|
+
indexerId: { type: "keyword" },
|
|
69
|
+
createTime: { type: "date", format: "strict_date_time" },
|
|
70
|
+
updateTime: { type: "date", format: "strict_date_time" }
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
};
|
|
74
|
+
}
|
|
75
|
+
export function buildSemanticIndexDocument(params) {
|
|
76
|
+
return {
|
|
77
|
+
itemType: params.itemType,
|
|
78
|
+
recordId: params.recordId,
|
|
79
|
+
...(params.aspectId ? { aspectId: params.aspectId } : {}),
|
|
80
|
+
...(params.parentRecordId ?
|
|
81
|
+
{ parentRecordId: params.parentRecordId } :
|
|
82
|
+
{}),
|
|
83
|
+
...(params.fileFormat ? { fileFormat: params.fileFormat } : {}),
|
|
84
|
+
...(params.subObjectId ? { subObjectId: params.subObjectId } : {}),
|
|
85
|
+
...(params.subObjectType ?
|
|
86
|
+
{ subObjectType: params.subObjectType } :
|
|
87
|
+
{}),
|
|
88
|
+
index_text_chunk: params.index_text_chunk,
|
|
89
|
+
embedding: params.embedding,
|
|
90
|
+
only_one_index_text_chunk: params.only_one_index_text_chunk,
|
|
91
|
+
index_text_chunk_length: params.index_text_chunk_length,
|
|
92
|
+
index_text_chunk_position: params.index_text_chunk_position,
|
|
93
|
+
index_text_chunk_overlap: params.index_text_chunk_overlap,
|
|
94
|
+
indexerId: params.indexerId,
|
|
95
|
+
createTime: params.createTime,
|
|
96
|
+
updateTime: params.updateTime
|
|
97
|
+
};
|
|
98
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"indexSchema.js","sourceRoot":"","sources":["../src/indexSchema.ts"],"names":[],"mappings":"AAEA,MAAM,sBAAsB,GAAG,CAAC,CAAC;AAEjC,MAAM,UAAU,4BAA4B,CAAC,MAA8B;IACvE,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,qBAAqB,CAAC;IACtD,MAAM,oBAAoB,GAAG,WAAW,CAAC,oBAAoB,CAAC;IAE9D,IAAI,WAAW,CAAC,YAAY,KAAK,sBAAsB,EAAE,CAAC;QACtD,MAAM,IAAI,KAAK,CACX,oCAAoC,sBAAsB,SAAS,WAAW,CAAC,YAAY,EAAE,CAChG,CAAC;IACN,CAAC;IAED,IACI,WAAW,CAAC,oBAAoB,CAAC,gBAAgB;QACjD,WAAW,CAAC,oBAAoB,CAAC,OAAO,EAC1C,CAAC;QACC,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;IAC5E,CAAC;IAED,OAAO;QACH,SAAS,EAAE,WAAW,CAAC,aAAa;QACpC,QAAQ,EAAE;YACN,KAAK,EAAE;gBACH,gBAAgB,EAAE,WAAW,CAAC,cAAc;gBAC5C,kBAAkB,EAAE,WAAW,CAAC,gBAAgB;aACnD;YACD,WAAW,EAAE,IAAI;SACpB;QACD,QAAQ,EAAE;YACN,UAAU,EAAE;gBACR,QAAQ,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC7B,QAAQ,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC7B,QAAQ,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC7B,cAAc,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBACnC,UAAU,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC/B,WAAW,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAChC,aAAa,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAClC,gBAAgB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBACrC,SAAS,EAAE;oBACP,IAAI,EAAE,YAAY;oBAClB,SAAS,EAAE,oBAAoB,CAAC,SAAS;oBACzC,UAAU,EAAE,oBAAoB,CAAC,SAAS;oBAC1C,IAAI,EAAE,oBAAoB,CAAC,IAAI;oBAC/B,GAAG,CAAC,oBAAoB,CAAC,gBAAgB;wBACrC,CAAC,CAAC;4BACI,iBAAiB,EACb,oBAAoB,CAAC,gBAAgB;yBAC5C;wBACH,CAAC,CAAC,EAAE,CAAC;oBACT,MAAM,EAAE;wBACJ,IAAI,EAAE,MAAM;wBACZ,MAAM,EAAE,OAAO;wBACf,UAAU,EAAE;4BACR,CAAC,EAAE,oBAAoB,CAAC,CAAC;4BACzB,eAAe,EACX,oBAAoB,CAAC,cAAc;4BACvC,SAAS,EAAE,oBAAoB,CAAC,QAAQ;4BACxC,GAAG,CAAC,oBAAoB,CAAC,OAAO;gCAC5B,CAAC,CAAC;oCACI,OAAO,EAAE;wCACL,IAAI,EACA,oBAAoB,CAAC,OAAO,CAAC,IAAI;wCACrC,UAAU,EAAE;4CACR,IAAI,EACA,oBAAoB,CAAC,OAAO;iDACvB,IAAI;4CACb,IAAI,EACA,oBAAoB,CAAC,OAAO;iDACvB,IAAI;yCAChB;qCACJ;iCACJ;gCACH,CAAC,CAAC,EAAE,CAAC;yBACZ;qBACJ;iBACJ;gBACD,yBAAyB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC9C,uBAAuB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC5C,yBAAyB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC9C,wBAAwB,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;gBAC7C,SAAS,EAAE,EAAE,
IAAI,EAAE,SAAS,EAAE;gBAC9B,UAAU,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,kBAAkB,EAAE;gBACxD,UAAU,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,kBAAkB,EAAE;aAC3D;SACJ;KACJ,CAAC;AACN,CAAC;AAuBD,MAAM,UAAU,0BAA0B,CACtC,MAA6B;IAE7B,OAAO;QACH,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACzD,GAAG,CAAC,MAAM,CAAC,cAAc;YACrB,CAAC,CAAC,EAAE,cAAc,EAAE,MAAM,CAAC,cAAc,EAAE;YAC3C,CAAC,CAAC,EAAE,CAAC;QACT,GAAG,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC/D,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAClE,GAAG,CAAC,MAAM,CAAC,aAAa;YACpB,CAAC,CAAC,EAAE,aAAa,EAAE,MAAM,CAAC,aAAa,EAAE;YACzC,CAAC,CAAC,EAAE,CAAC;QACT,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;QACzC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,yBAAyB,EAAE,MAAM,CAAC,yBAAyB;QAC3D,uBAAuB,EAAE,MAAM,CAAC,uBAAuB;QACvD,yBAAyB,EAAE,MAAM,CAAC,yBAAyB;QAC3D,wBAAwB,EAAE,MAAM,CAAC,wBAAwB;QACzD,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,UAAU,EAAE,MAAM,CAAC,UAAU;KAChC,CAAC;AACN,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import { onRecordFoundType } from "@magda/minion-framework/dist/MinionOptions.js";
|
|
2
|
+
import { Chunker } from "./chunker.js";
|
|
3
|
+
import EmbeddingApiClient from "@magda/typescript-common/dist/EmbeddingApiClient.js";
|
|
4
|
+
import OpensearchApiClient from "@magda/typescript-common/dist/OpensearchApiClient.js";
|
|
5
|
+
import SemanticIndexerOptions from "./semanticIndexerOptions.js";
|
|
6
|
+
import Registry from "@magda/typescript-common/dist/registry/AuthorizedRegistryClient.js";
|
|
7
|
+
/**
 * Factory that binds the indexer options, chunker and API clients and returns
 * the `onRecordFound` handler used for registry records.
 */
export declare const onRecordFoundRegistryRecord: (options: SemanticIndexerOptions, chunker: Chunker, embeddingApiClient: EmbeddingApiClient, opensearchApiClient: OpensearchApiClient, registryReadonlyClient: Registry) => onRecordFoundType;
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { indexEmbeddingText } from "./indexEmbeddingText.js";
|
|
2
|
+
import { SkipError } from "./SkipError.js";
|
|
3
|
+
/**
 * Creates the `onRecordFound` handler that the minion SDK invokes for
 * registry records.
 *
 * The returned async handler:
 *  1. returns without doing anything when the record has no aspects, or when
 *     it lacks any aspect named in `options.aspects` (NOTE: if
 *     `options.aspects` is undefined the check is falsy and every record is
 *     skipped);
 *  2. asks the user-provided `options.createEmbeddingText` for the text, with
 *     `format`, `filePath` and `url` set to `null`;
 *  3. hands the result to `indexEmbeddingText` together with the record id
 *     and the `dataset-format` aspect's `format`.
 *
 * Any failure inside step 2 - including the "no text or subObjects" case - is
 * converted into a `SkipError`; a `SkipError` is logged with `console.warn`
 * and swallowed, every other error is rethrown.
 *
 * @param options Semantic indexer options (`aspects`, `createEmbeddingText`, ...).
 * @param chunker Chunker forwarded to `indexEmbeddingText`.
 * @param embeddingApiClient Embedding client forwarded to `indexEmbeddingText`.
 * @param opensearchApiClient OpenSearch client forwarded to `indexEmbeddingText`.
 * @param registryReadonlyClient Registry client passed to `createEmbeddingText` as `readonlyRegistry`.
 * @returns The async `(record, _registry)` handler.
 */
export const onRecordFoundRegistryRecord = (options, chunker, embeddingApiClient, opensearchApiClient, registryReadonlyClient) => async (record, _registry) => {
    try {
        const hasRequiredAspects = Boolean(record.aspects) &&
            options.aspects?.every((aspectName) => aspectName in record.aspects);
        if (!hasRequiredAspects) {
            return;
        }

        let embeddingText;
        try {
            const textRequest = {
                record,
                format: null,
                filePath: null,
                url: null,
                readonlyRegistry: registryReadonlyClient
            };
            embeddingText = await options.createEmbeddingText(textRequest);
            const hasContent = embeddingText.text || embeddingText.subObjects;
            if (!hasContent) {
                throw new SkipError("User-provided createEmbeddingText function returned no text or subObjects");
            }
        }
        catch (err) {
            // Everything raised above (including the SkipError) is reported
            // as a skip attributed to the user-provided function.
            throw new SkipError(`Error in user-provided createEmbeddingText function: ${err.message}`);
        }

        const metadata = {
            recordId: record.id,
            fileFormat: record.aspects["dataset-format"]?.format
        };
        await indexEmbeddingText({
            options,
            chunker,
            embeddingApiClient,
            opensearchApiClient,
            embeddingText,
            metadata
        });
    }
    catch (err) {
        if (!(err instanceof SkipError)) {
            throw err;
        }
        console.warn(`Skipping record ${record.id} because:`, err.message);
    }
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"onRecordFoundRegistryRecord.js","sourceRoot":"","sources":["../src/onRecordFoundRegistryRecord.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAE7D,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAG3C,oFAAoF;AACpF,MAAM,CAAC,MAAM,2BAA2B,GAAG,CACvC,OAA+B,EAC/B,OAAgB,EAChB,kBAAsC,EACtC,mBAAwC,EACxC,sBAAgC,EACf,EAAE;IACnB,OAAO,KAAK,EAAE,MAAc,EAAE,SAAS,EAAE,EAAE;QACvC,IAAI,CAAC;YACD,IACI,CAAC,MAAM,CAAC,OAAO;gBACf,CAAC,OAAO,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,IAAI,MAAM,CAAC,OAAO,CAAC,EAC/D,CAAC;gBACC,OAAO;YACX,CAAC;YAED,IAAI,aAAa,CAAC;YAClB,IAAI,CAAC;gBACD,aAAa,GAAG,MAAM,OAAO,CAAC,mBAAmB,CAAC;oBAC9C,MAAM;oBACN,MAAM,EAAE,IAAI;oBACZ,QAAQ,EAAE,IAAI;oBACd,GAAG,EAAE,IAAI;oBACT,gBAAgB,EAAE,sBAAsB;iBAC3C,CAAC,CAAC;gBAEH,IAAI,CAAC,aAAa,CAAC,IAAI,IAAI,CAAC,aAAa,CAAC,UAAU,EAAE,CAAC;oBACnD,MAAM,IAAI,SAAS,CACf,2EAA2E,CAC9E,CAAC;gBACN,CAAC;YACL,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACX,MAAM,IAAI,SAAS,CACf,wDACK,GAAa,CAAC,OACnB,EAAE,CACL,CAAC;YACN,CAAC;YAED,MAAM,kBAAkB,CAAC;gBACrB,OAAO;gBACP,OAAO;gBACP,kBAAkB;gBAClB,mBAAmB;gBACnB,aAAa;gBACb,QAAQ,EAAE;oBACN,QAAQ,EAAE,MAAM,CAAC,EAAE;oBACnB,UAAU,EAAE,MAAM,CAAC,OAAO,CAAC,gBAAgB,CAAC,EAAE,MAAM;iBACvD;aACJ,CAAC,CAAC;QACP,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,IAAI,GAAG,YAAY,SAAS,EAAE,CAAC;gBAC3B,OAAO,CAAC,IAAI,CACR,mBAAmB,MAAM,CAAC,EAAE,WAAW,EACvC,GAAG,CAAC,OAAO,CACd,CAAC;gBACF,OAAO;YACX,CAAC;YACD,MAAM,GAAG,CAAC;QACd,CAAC;IACL,CAAC,CAAC;AACN,CAAC,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { onRecordFoundType } from "@magda/minion-framework/dist/MinionOptions.js";
|
|
2
|
+
import { Chunker } from "./chunker.js";
|
|
3
|
+
import EmbeddingApiClient from "@magda/typescript-common/dist/EmbeddingApiClient.js";
|
|
4
|
+
import OpensearchApiClient from "@magda/typescript-common/dist/OpensearchApiClient.js";
|
|
5
|
+
import SemanticIndexerOptions from "./semanticIndexerOptions.js";
|
|
6
|
+
import Registry from "@magda/typescript-common/dist/registry/AuthorizedRegistryClient.js";
|
|
7
|
+
import { MinioClient } from "./MinioClient.js";
|
|
8
|
+
/**
 * Factory that binds the indexer options, chunker, API clients and Minio
 * client and returns the `onRecordFound` handler used for storage objects
 * (distribution records).
 */
export declare const onRecordFoundStorageObject: (options: SemanticIndexerOptions, chunker: Chunker, embeddingApiClient: EmbeddingApiClient, opensearchApiClient: OpensearchApiClient, minioClient: MinioClient, registryReadonlyClient: Registry) => onRecordFoundType;
|
|
9
|
+
/**
 * Looks up the id of the record whose `dataset-distributions` aspect lists
 * `distributionId`. Resolves to `null` when nothing is found or the lookup
 * fails; the implementation logs failures instead of rejecting.
 */
export declare function getParentRecordId(distributionId: string, registry: Registry): Promise<string | null>;
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import { indexEmbeddingText } from "./indexEmbeddingText.js";
|
|
2
|
+
import retry from "@magda/typescript-common/dist/retry.js";
|
|
3
|
+
import fetch from "node-fetch";
|
|
4
|
+
import * as fs from "fs";
|
|
5
|
+
import { tmpdir } from "os";
|
|
6
|
+
import { join } from "path";
|
|
7
|
+
import { v4 as uuidv4 } from "uuid";
|
|
8
|
+
import { SkipError } from "./SkipError.js";
|
|
9
|
+
import urijs from "urijs";
|
|
10
|
+
import { pipeline } from "stream/promises";
|
|
11
|
+
import ServerError from "@magda/typescript-common/dist/ServerError.js";
|
|
12
|
+
import { deleteTempFile } from "./helpers.js";
|
|
13
|
+
/**
 * Creates the `onRecordFound` handler that the minion SDK invokes for
 * storage objects (distribution records).
 *
 * The returned async handler:
 *  1. determines the file format from the `dataset-format` aspect, then the
 *     `dcat-distribution-strings` aspect, then the upper-cased suffix of the
 *     download URL (`downloadURL`, falling back to `accessURL`);
 *  2. returns without doing anything when there is no format, no URL, or the
 *     format matches none of `options.formatTypes` (case-insensitive
 *     substring match);
 *  3. resolves the parent record id, downloads the file unless
 *     `options.autoDownloadFile` is explicitly falsy, and calls the
 *     user-provided `options.createEmbeddingText`;
 *  4. always deletes the downloaded temp file, then passes the text to
 *     `indexEmbeddingText`.
 *
 * Download and `createEmbeddingText` failures are converted into `SkipError`;
 * a `SkipError` is logged with `console.warn` and swallowed, every other
 * error is rethrown.
 *
 * @param options Semantic indexer options (`formatTypes`, `autoDownloadFile`, `createEmbeddingText`, ...).
 * @param chunker Chunker forwarded to `indexEmbeddingText`.
 * @param embeddingApiClient Embedding client forwarded to `indexEmbeddingText`.
 * @param opensearchApiClient OpenSearch client forwarded to `indexEmbeddingText`.
 * @param minioClient Client used for downloads (see `downloadFile`).
 * @param registryReadonlyClient Registry client used for the parent lookup and passed to `createEmbeddingText`.
 * @returns The async `(dist, registry)` handler; `registry` is not used.
 */
export const onRecordFoundStorageObject = (options, chunker, embeddingApiClient, opensearchApiClient, minioClient, registryReadonlyClient) => async (dist, registry) => {
    try {
        const formatFromAspect = dist.aspects?.["dataset-format"]?.format;
        const dcatStrings = dist.aspects?.["dcat-distribution-strings"] || {};
        const fileDownloadURL = dcatStrings.downloadURL || dcatStrings.accessURL;

        let format = formatFromAspect || dcatStrings.format;
        if (!format && fileDownloadURL) {
            // Last resort: derive the format from the URL's file suffix.
            format = new urijs(fileDownloadURL).suffix().toUpperCase();
        }

        // filler record
        const isWantedFormat = Boolean(format) &&
            Boolean(fileDownloadURL) &&
            options.formatTypes?.some((wanted) => format.toLowerCase().includes(wanted.toLowerCase()));
        if (!isWantedFormat) {
            return;
        }

        let embeddingText;
        let filePath = null;
        const parentRecordId = await getParentRecordId(dist.id, registryReadonlyClient);

        try {
            try {
                // Downloading is on by default; only an explicit falsy value disables it.
                const downloadDisabled = options.autoDownloadFile !== undefined &&
                    !options.autoDownloadFile;
                if (!downloadDisabled) {
                    filePath = await downloadFileWithRetry(fileDownloadURL, minioClient);
                }
            }
            catch (err) {
                throw new SkipError(`Error in downloading file: ${err.message}`);
            }

            try {
                embeddingText = await options.createEmbeddingText({
                    record: dist,
                    format,
                    filePath,
                    url: fileDownloadURL,
                    readonlyRegistry: registryReadonlyClient
                });
                const hasContent = embeddingText.text || embeddingText.subObjects;
                if (!hasContent) {
                    throw new SkipError("User-provided createEmbeddingText function returned no text or subObjects");
                }
            }
            catch (err) {
                // Everything raised above (including the SkipError) is reported
                // as a skip attributed to the user-provided function.
                throw new SkipError(`Error in user-provided createEmbeddingText function: ${err.message}`);
            }
        }
        finally {
            // The temp file is only needed while createEmbeddingText runs.
            if (filePath) {
                await deleteTempFile(filePath);
            }
        }

        const metadata = {
            recordId: dist.id,
            parentRecordId,
            aspectId: dist.aspects["dataset-format"]?.id,
            fileFormat: format
        };
        await indexEmbeddingText({
            options,
            chunker,
            embeddingApiClient,
            opensearchApiClient,
            embeddingText,
            metadata
        });
    }
    catch (err) {
        if (!(err instanceof SkipError)) {
            throw err;
        }
        console.warn(`Skipping distribution ${dist.id} because:`, err.message);
    }
};
|
|
88
|
+
/**
 * Downloads `url` through `downloadFile`, wrapped in the shared `retry`
 * helper (called with `1` and `5`, as before).
 *
 * Each failed attempt is now logged instead of being discarded by an empty
 * callback, so intermittent download problems are visible. The URL itself is
 * deliberately not logged, since it may carry credentials.
 *
 * @param url Location of the file to download.
 * @param minioClient Client used by `downloadFile` for storage-api URLs.
 * @returns Whatever `downloadFile` resolves to (a local file path).
 */
async function downloadFileWithRetry(url, minioClient) {
    // NOTE(review): the callback is assumed to receive (error, retries left),
    // matching how `retry` is used in semanticIndexer - confirm against retry.js.
    return retry(() => downloadFile(url, minioClient), 1, 5, (err, retriesLeft) => console.warn(`File download failed, remaining retries: ${retriesLeft}, error:`, err?.message ?? err));
}
|
|
91
|
+
/**
 * Downloads the file at `url` and returns the path of the local copy.
 *
 * - `magda://storage-api/...` URLs are delegated to `minioClient.downloadFile`.
 * - Any other URL is fetched over HTTP and streamed into a uniquely named
 *   file in the OS temp directory; the file keeps the URL's suffix when the
 *   URL has one.
 *
 * Every failure is reported as a `SkipError`. The message now includes the
 * underlying cause for network and write failures as well (previously only
 * the Minio branch did), and a partially written temp file is removed.
 *
 * @param url Location of the file to download.
 * @param minioClient Client exposing `downloadFile(url)` for storage-api URLs.
 * @returns Path of the downloaded file.
 * @throws {SkipError} On Minio, network, HTTP status, empty body or write failures.
 */
async function downloadFile(url, minioClient) {
    // Thrown values are not guaranteed to be Error instances.
    const reasonOf = (err) => (err instanceof Error ? err.message : String(err));

    const uri = urijs(url);
    if (uri.protocol() === "magda" && uri.hostname() === "storage-api") {
        try {
            return await minioClient.downloadFile(url);
        }
        catch (err) {
            throw new SkipError(`Failed to download file from Minio: ${reasonOf(err)}`);
        }
    }

    let response;
    try {
        response = await fetch(url);
    }
    catch (err) {
        throw new SkipError(`Failed to download file because network error: ${reasonOf(err)}`);
    }
    if (!response.ok) {
        throw new SkipError(`Failed to download file because HTTP error ${response.status}`);
    }
    if (!response.body) {
        throw new SkipError("No response body to write to file");
    }

    // Reuse the already parsed URI; avoid a dangling "." when there is no suffix.
    const suffix = uri.suffix();
    const baseName = `${uuidv4()}`;
    const tempFilePath = join(tmpdir(), suffix ? `${baseName}.${suffix}` : baseName);
    try {
        const writeStream = fs.createWriteStream(tempFilePath);
        await pipeline(response.body, writeStream);
        return tempFilePath;
    }
    catch (err) {
        // Do not leave a partial download behind.
        await deleteTempFile(tempFilePath);
        throw new SkipError(`Failed to write file: ${reasonOf(err)}`);
    }
}
|
|
128
|
+
/**
 * Finds the id of the record whose `dataset-distributions` aspect lists the
 * given distribution.
 *
 * The registry is queried with the aspect query
 * `dataset-distributions.distributions:<|<distributionId>`, and the id of
 * the first returned record is used.
 *
 * This function never rejects: a `ServerError` result, a result without
 * `records`, or any thrown error is logged with `console.error` and yields
 * `null`.
 *
 * @param distributionId Id of the distribution record.
 * @param registry Registry client exposing `getRecords`.
 * @returns The parent record id, or `null` when none was found or the lookup failed.
 */
export async function getParentRecordId(distributionId, registry) {
    try {
        const aspectQueries = ["dataset-distributions.distributions:<|" + distributionId];
        const lookup = await registry.getRecords(["dataset-distributions"], undefined, undefined, true, undefined, aspectQueries);

        if (lookup instanceof ServerError) {
            console.error(`Failed to get parent record id: ${lookup.message}`);
            return null;
        }
        if ("records" in lookup) {
            const firstMatch = lookup.records[0];
            return firstMatch?.id || null;
        }
        console.error(`Failed to get parent record id`);
        return null;
    }
    catch (e) {
        console.error(`Unexpected error when getting parent record id`, e);
        return null;
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"onRecordFoundStorageObject.js","sourceRoot":"","sources":["../src/onRecordFoundStorageObject.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAG7D,OAAO,KAAK,MAAM,sCAAsC,CAAC;AACzD,OAAO,KAAK,MAAM,YAAY,CAAC;AAC/B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAE,MAAM,EAAE,MAAM,IAAI,CAAC;AAC5B,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,EAAE,IAAI,MAAM,EAAE,MAAM,MAAM,CAAC;AACpC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,QAAQ,EAAE,MAAM,iBAAiB,CAAC;AAE3C,OAAO,WAAW,MAAM,4CAA4C,CAAC;AAErE,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,mFAAmF;AACnF,MAAM,CAAC,MAAM,0BAA0B,GAAG,CACtC,OAA+B,EAC/B,OAAgB,EAChB,kBAAsC,EACtC,mBAAwC,EACxC,WAAwB,EACxB,sBAAgC,EACf,EAAE;IACnB,OAAO,KAAK,EAAE,IAAY,EAAE,QAAQ,EAAE,EAAE;QACpC,IAAI,CAAC;YACD,MAAM,aAAa,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;YAC/D,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC,2BAA2B,CAAC,IAAI,EAAE,CAAC;YACnE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS,EAAE,GAAG,QAAQ,CAAC;YAChE,MAAM,eAAe,GAAG,WAAW,IAAI,SAAS,CAAC;YACjD,IAAI,MAAM,GAAG,aAAa,IAAI,UAAU,CAAC;YACzC,IAAI,CAAC,MAAM,IAAI,eAAe,EAAE,CAAC;gBAC7B,MAAM,GAAG,IAAI,KAAK,CAAC,eAAe,CAAC,CAAC,MAAM,EAAE,CAAC,WAAW,EAAE,CAAC;YAC/D,CAAC;YAED,gBAAgB;YAChB,IACI,CAAC,MAAM;gBACP,CAAC,eAAe;gBAChB,CAAC,OAAO,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAC7B,MAAM,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CACjD,EACH,CAAC;gBACC,OAAO;YACX,CAAC;YAED,IAAI,aAA4B,CAAC;YACjC,IAAI,QAAQ,GAAkB,IAAI,CAAC;YAEnC,MAAM,cAAc,GAAG,MAAM,iBAAiB,CAC1C,IAAI,CAAC,EAAE,EACP,sBAAsB,CACzB,CAAC;YACF,IAAI,CAAC;gBACD,IAAI,CAAC;oBACD,IACI,OAAO,CAAC,gBAAgB,KAAK,SAAS;wBACtC,OAAO,CAAC,gBAAgB,EAC1B,CAAC;wBACC,QAAQ,GAAG,MAAM,qBAAqB,CAClC,eAAe,EACf,WAAW,CACd,CAAC;oBACN,CAAC;gBACL,CAAC;gBAAC,OAAO,GAAG,EAAE,CAAC;oBACX,MAAM,IAAI,SAAS,CACf,8BAA+B,GAAa,CAAC,OAAO,EAAE,CACzD,CAAC;gBACN,CAAC;gBACD,IAAI,CAAC;oBACD,aAAa,GAAG,MAAM,OAAO,CAAC,mBAAmB,CAAC;wBAC9C,MAAM,EAAE,IAAI;wBACZ,MAAM,EAAE,MAAM;wBACd,QAAQ;wBACR,GAAG,EAAE,eAAe;wBACpB
,gBAAgB,EAAE,sBAAsB;qBAC3C,CAAC,CAAC;oBAEH,IAAI,CAAC,aAAa,CAAC,IAAI,IAAI,CAAC,aAAa,CAAC,UAAU,EAAE,CAAC;wBACnD,MAAM,IAAI,SAAS,CACf,2EAA2E,CAC9E,CAAC;oBACN,CAAC;gBACL,CAAC;gBAAC,OAAO,GAAG,EAAE,CAAC;oBACX,MAAM,IAAI,SAAS,CACf,wDACK,GAAa,CAAC,OACnB,EAAE,CACL,CAAC;gBACN,CAAC;YACL,CAAC;oBAAS,CAAC;gBACP,IAAI,QAAQ,EAAE,CAAC;oBACX,MAAM,cAAc,CAAC,QAAQ,CAAC,CAAC;gBACnC,CAAC;YACL,CAAC;YAED,MAAM,kBAAkB,CAAC;gBACrB,OAAO;gBACP,OAAO;gBACP,kBAAkB;gBAClB,mBAAmB;gBACnB,aAAa;gBACb,QAAQ,EAAE;oBACN,QAAQ,EAAE,IAAI,CAAC,EAAE;oBACjB,cAAc,EAAE,cAAc;oBAC9B,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,EAAE,EAAE;oBAC5C,UAAU,EAAE,MAAM;iBACrB;aACJ,CAAC,CAAC;QACP,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,IAAI,GAAG,YAAY,SAAS,EAAE,CAAC;gBAC3B,OAAO,CAAC,IAAI,CACR,yBAAyB,IAAI,CAAC,EAAE,WAAW,EAC3C,GAAG,CAAC,OAAO,CACd,CAAC;gBACF,OAAO;YACX,CAAC;YACD,MAAM,GAAG,CAAC;QACd,CAAC;IACL,CAAC,CAAC;AACN,CAAC,CAAC;AAEF,KAAK,UAAU,qBAAqB,CAChC,GAAW,EACX,WAAwB;IAExB,OAAO,KAAK,CACR,GAAG,EAAE,CAAC,YAAY,CAAC,GAAG,EAAE,WAAW,CAAC,EACpC,CAAC,EACD,CAAC,EACD,CAAC,GAAG,EAAE,OAAO,EAAE,EAAE,GAAE,CAAC,CACvB,CAAC;AACN,CAAC;AAED,KAAK,UAAU,YAAY,CACvB,GAAW,EACX,WAAwB;IAExB,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;IACvB,IAAI,GAAG,CAAC,QAAQ,EAAE,KAAK,OAAO,IAAI,GAAG,CAAC,QAAQ,EAAE,KAAK,aAAa,EAAE,CAAC;QACjE,IAAI,CAAC;YACD,OAAO,MAAM,WAAW,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QAC/C,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,MAAM,IAAI,SAAS,CACf,uCAAwC,GAAa,CAAC,OAAO,EAAE,CAClE,CAAC;QACN,CAAC;IACL,CAAC;IAED,IAAI,QAAQ,CAAC;IACb,IAAI,CAAC;QACD,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACX,MAAM,IAAI,SAAS,CAAC,+CAA+C,CAAC,CAAC;IACzE,CAAC;IAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACf,MAAM,IAAI,SAAS,CACf,8CAA8C,QAAQ,CAAC,MAAM,EAAE,CAClE,CAAC;IACN,CAAC;IAED,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QACjB,MAAM,IAAI,SAAS,CAAC,mCAAmC,CAAC,CAAC;IAC7D,CAAC;IAED,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC;IACzB,MAAM,YAAY,GAAG,GAAG,MAAM,EAAE,EAAE,CAAC;IACnC,MAAM,MAAM,GAAG,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC;IACvC,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,YAAY,IAA
I,MAAM,EAAE,CAAC,CAAC;IAEhE,IAAI,CAAC;QACD,MAAM,WAAW,GAAG,EAAE,CAAC,iBAAiB,CAAC,YAAY,CAAC,CAAC;QACvD,MAAM,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;QAC3C,OAAO,YAAY,CAAC;IACxB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACX,MAAM,cAAc,CAAC,YAAY,CAAC,CAAC;QACnC,MAAM,IAAI,SAAS,CAAC,sBAAsB,CAAC,CAAC;IAChD,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACnC,cAAsB,EACtB,QAAkB;IAElB,IAAI,CAAC;QACD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,UAAU,CACpC,CAAC,uBAAuB,CAAC,EACzB,SAAS,EACT,SAAS,EACT,IAAI,EACJ,SAAS,EACT,CAAC,wCAAwC,GAAG,cAAc,CAAC,CAC9D,CAAC;QAEF,IAAI,MAAM,YAAY,WAAW,EAAE,CAAC;YAChC,OAAO,CAAC,KAAK,CAAC,mCAAmC,MAAM,CAAC,OAAO,EAAE,CAAC,CAAC;YACnE,OAAO,IAAI,CAAC;QAChB,CAAC;QACD,IAAI,CAAC,CAAC,SAAS,IAAI,MAAM,CAAC,EAAE,CAAC;YACzB,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;YAChD,OAAO,IAAI,CAAC;QAChB,CAAC;QAED,OAAO,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,IAAI,IAAI,CAAC;IACzC,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACT,OAAO,CAAC,KAAK,CAAC,gDAAgD,EAAE,CAAC,CAAC,CAAC;QACnE,OAAO,IAAI,CAAC;IAChB,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import minion from "@magda/minion-framework/dist/index.js";
|
|
2
|
+
import { Chunker, UserDefinedChunkStrategy, RecursiveChunkStrategy } from "./chunker.js";
|
|
3
|
+
import { onRecordFoundRegistryRecord } from "./onRecordFoundRegistryRecord.js";
|
|
4
|
+
import { onRecordFoundStorageObject } from "./onRecordFoundStorageObject.js";
|
|
5
|
+
import { createSemanticIndexerMapping } from "./indexSchema.js";
|
|
6
|
+
import EmbeddingApiClient from "@magda/typescript-common/dist/EmbeddingApiClient.js";
|
|
7
|
+
import OpensearchApiClient from "@magda/typescript-common/dist/OpensearchApiClient.js";
|
|
8
|
+
import { validateSemanticIndexerOptions } from "./semanticIndexerOptions.js";
|
|
9
|
+
import retry from "@magda/typescript-common/dist/retry.js";
|
|
10
|
+
import { MinioClient } from "./MinioClient.js";
|
|
11
|
+
import { MAGDA_SYSTEM_ID } from "@magda/typescript-common/dist/registry/TenantConsts.js";
|
|
12
|
+
import Registry from "@magda/typescript-common/dist/registry/AuthorizedRegistryClient.js";
|
|
13
|
+
/**
 * Main entry point of the semantic indexer.
 *
 * Validates `userConfig`, creates the OpenSearch, embedding and read-only
 * registry clients, creates the semantic index when it does not exist yet,
 * builds the chunker, and starts a minion whose `onRecordFound` handler
 * depends on `userConfig.itemType` (`"registryRecord"` or `"storageObject"`).
 *
 * Errors during initialisation, and a rejected minion promise, are logged and
 * terminate the process with exit code 1. The minion promise is not awaited.
 *
 * @param userConfig Semantic indexer options supplied by the indexer implementation.
 * @returns A promise that resolves once the minion has been started.
 */
export default async function semanticIndexer(userConfig) {
    try {
        validateSemanticIndexerOptions(userConfig);
        const semanticIndexerConfig = userConfig.argv.semanticIndexerConfig;
        const opensearchApiClient = await retry(() => OpensearchApiClient.getOpensearchApiClient({
            url: userConfig.argv.opensearchApiURL
        }), 5, 5, (e, left) => console.error(`Opensearch connection failed, remaining retries: ${left}, error:`, e.message));
        const embeddingApiClient = await retry(() => Promise.resolve(new EmbeddingApiClient({
            baseApiUrl: userConfig.argv.embeddingApiURL
        })), 5, 5, (e, left) => console.error(`Embedding API connection failed, remaining retries: ${left}, error:`, e.message));
        const registryReadonlyClient = new Registry({
            baseUrl: userConfig.argv.registryReadonlyURL,
            jwtSecret: userConfig.argv.jwtSecret,
            userId: userConfig.argv.userId,
            maxRetries: 3,
            tenantId: MAGDA_SYSTEM_ID
        });
        // Only create the index (and run the mapping's config checks) on first use.
        if (!(await opensearchApiClient.indexExists(semanticIndexerConfig.fullIndexName))) {
            const indexDefinition = createSemanticIndexerMapping(userConfig);
            await opensearchApiClient.createIndex(indexDefinition);
        }
        // A user-defined strategy wins; otherwise chunk recursively using the
        // deployment config first and the indexer's own values as fallback.
        const chunkStrategy = userConfig.chunkStrategy ?
            new UserDefinedChunkStrategy(userConfig.chunkStrategy) :
            new RecursiveChunkStrategy(semanticIndexerConfig.chunkSizeLimit ||
                userConfig.chunkSizeLimit,
            // `??` rather than `||`: an explicitly configured overlap of 0 is a
            // valid value and must not be replaced by the fallback.
            semanticIndexerConfig.overlap ?? userConfig.overlap);
        const chunker = new Chunker(chunkStrategy);
        let onRecordFound;
        let minionOptions;
        if (userConfig.itemType === "registryRecord") {
            onRecordFound = onRecordFoundRegistryRecord(userConfig, chunker, embeddingApiClient, opensearchApiClient, registryReadonlyClient);
            minionOptions = {
                argv: userConfig.argv,
                id: userConfig.id,
                aspects: userConfig.aspects || [],
                optionalAspects: userConfig.optionalAspects || [],
                writeAspectDefs: [],
                async: true,
                dereference: true,
                includeEvents: false,
                includeRecords: true,
                onRecordFound: onRecordFound
            };
        }
        else if (userConfig.itemType === "storageObject") {
            const minioClient = new MinioClient(userConfig.argv.minioConfig, userConfig.argv.minioAccessKey, userConfig.argv.minioSecretKey);
            onRecordFound = onRecordFoundStorageObject(userConfig, chunker, embeddingApiClient, opensearchApiClient, minioClient, registryReadonlyClient);
            minionOptions = {
                argv: userConfig.argv,
                id: userConfig.id,
                // Storage objects are always selected by these two aspects.
                aspects: ["dcat-distribution-strings", "dataset-format"],
                optionalAspects: [],
                writeAspectDefs: [],
                async: true,
                dereference: false,
                includeEvents: false,
                includeRecords: true,
                onRecordFound: onRecordFound,
                maxRetries: 3
            };
        }
        else {
            throw new Error("Invalid itemType");
        }
        // Not awaited: failures of the running minion are handled here.
        minion(minionOptions).catch((e) => {
            console.error("Minion execution error: " + e.message, e);
            process.exit(1);
        });
    }
    catch (e) {
        console.error("semanticIndexer initialization error: " + e.message, e);
        process.exit(1);
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"semanticIndexer.js","sourceRoot":"","sources":["../src/semanticIndexer.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,qCAAqC,CAAC;AACzD,OAAO,EACH,OAAO,EACP,wBAAwB,EACxB,sBAAsB,EACzB,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,2BAA2B,EAAE,MAAM,kCAAkC,CAAC;AAC/E,OAAO,EAAE,0BAA0B,EAAE,MAAM,iCAAiC,CAAC;AAC7E,OAAO,EAAE,4BAA4B,EAAE,MAAM,kBAAkB,CAAC;AAChE,OAAO,kBAAkB,MAAM,mDAAmD,CAAC;AACnF,OAAO,mBAAmB,MAAM,oDAAoD,CAAC;AACrF,OAA+B,EAC3B,8BAA8B,EACjC,MAAM,6BAA6B,CAAC;AAIrC,OAAO,KAAK,MAAM,sCAAsC,CAAC;AACzD,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,eAAe,EAAE,MAAM,sDAAsD,CAAC;AACvF,OAAO,QAAQ,MAAM,kEAAkE,CAAC;AAExF,qCAAqC;AACrC,MAAM,CAAC,OAAO,CAAC,KAAK,UAAU,eAAe,CACzC,UAAkC;IAElC,IAAI,CAAC;QACD,8BAA8B,CAAC,UAAU,CAAC,CAAC;QAC3C,MAAM,qBAAqB,GAAG,UAAU,CAAC,IAAI,CAAC,qBAAqB,CAAC;QACpE,MAAM,mBAAmB,GAAG,MAAM,KAAK,CACnC,GAAG,EAAE,CACD,mBAAmB,CAAC,sBAAsB,CAAC;YACvC,GAAG,EAAE,UAAU,CAAC,IAAI,CAAC,gBAAgB;SACxC,CAAC,EACN,CAAC,EACD,CAAC,EACD,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CACR,OAAO,CAAC,KAAK,CACT,oDAAoD,IAAI,UAAU,EAClE,CAAC,CAAC,OAAO,CACZ,CACR,CAAC;QAEF,MAAM,kBAAkB,GAAG,MAAM,KAAK,CAClC,GAAG,EAAE,CACD,OAAO,CAAC,OAAO,CACX,IAAI,kBAAkB,CAAC;YACnB,UAAU,EAAE,UAAU,CAAC,IAAI,CAAC,eAAe;SAC9C,CAAC,CACL,EACL,CAAC,EACD,CAAC,EACD,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CACR,OAAO,CAAC,KAAK,CACT,uDAAuD,IAAI,UAAU,EACrE,CAAC,CAAC,OAAO,CACZ,CACR,CAAC;QAEF,MAAM,sBAAsB,GAAG,IAAI,QAAQ,CAAC;YACxC,OAAO,EAAE,UAAU,CAAC,IAAI,CAAC,mBAAmB;YAC5C,SAAS,EAAE,UAAU,CAAC,IAAI,CAAC,SAAS;YACpC,MAAM,EAAE,UAAU,CAAC,IAAI,CAAC,MAAM;YAC9B,UAAU,EAAE,CAAC;YACb,QAAQ,EAAE,eAAe;SAC5B,CAAC,CAAC;QAEH,IACI,CAAC,CAAC,MAAM,mBAAmB,CAAC,WAAW,CACnC,qBAAqB,CAAC,aAAa,CACtC,CAAC,EACJ,CAAC;YACC,MAAM,eAAe,GAAG,4BAA4B,CAAC,UAAU,CAAC,CAAC;YACjE,MAAM,mBAAmB,CAAC,WAAW,CAAC,eAAe,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,aAAa,GAAG,UAAU,CAAC,aAAa;YAC1C,CAAC,CAAC,IAAI,wBAAwB,CAAC,UAAU,CAAC,aAAa,CAAC;YACxD,CAAC,CAAC,IAAI,sBAAsB,CACtB,qBAAqB,CAAC,cAAc;gBAChC,UAAU,CAAC,cAAc,EAC7B,qBAAqB,CAAC,OAAO,IAAI,UAAU,CAAC,OAAO,CACtD,CAAC;QAER,MAAM,OAAO,GAAG,IAAI,OAAO,CAAC,a
AAa,CAAC,CAAC;QAE3C,IAAI,aAAgC,CAAC;QACrC,IAAI,aAA4B,CAAC;QAEjC,IAAI,UAAU,CAAC,QAAQ,KAAK,gBAAgB,EAAE,CAAC;YAC3C,aAAa,GAAG,2BAA2B,CACvC,UAAU,EACV,OAAO,EACP,kBAAkB,EAClB,mBAAmB,EACnB,sBAAsB,CACzB,CAAC;YACF,aAAa,GAAG;gBACZ,IAAI,EAAE,UAAU,CAAC,IAAI;gBACrB,EAAE,EAAE,UAAU,CAAC,EAAE;gBACjB,OAAO,EAAE,UAAU,CAAC,OAAO,IAAI,EAAE;gBACjC,eAAe,EAAE,UAAU,CAAC,eAAe,IAAI,EAAE;gBACjD,eAAe,EAAE,EAAE;gBACnB,KAAK,EAAE,IAAI;gBACX,WAAW,EAAE,IAAI;gBACjB,aAAa,EAAE,KAAK;gBACpB,cAAc,EAAE,IAAI;gBACpB,aAAa,EAAE,aAAa;aAC/B,CAAC;QACN,CAAC;aAAM,IAAI,UAAU,CAAC,QAAQ,KAAK,eAAe,EAAE,CAAC;YACjD,MAAM,WAAW,GAAG,IAAI,WAAW,CAC/B,UAAU,CAAC,IAAI,CAAC,WAAW,EAC3B,UAAU,CAAC,IAAI,CAAC,cAAc,EAC9B,UAAU,CAAC,IAAI,CAAC,cAAc,CACjC,CAAC;YACF,aAAa,GAAG,0BAA0B,CACtC,UAAU,EACV,OAAO,EACP,kBAAkB,EAClB,mBAAmB,EACnB,WAAW,EACX,sBAAsB,CACzB,CAAC;YACF,aAAa,GAAG;gBACZ,IAAI,EAAE,UAAU,CAAC,IAAI;gBACrB,EAAE,EAAE,UAAU,CAAC,EAAE;gBACjB,OAAO,EAAE,CAAC,2BAA2B,EAAE,gBAAgB,CAAC;gBACxD,eAAe,EAAE,EAAE;gBACnB,eAAe,EAAE,EAAE;gBACnB,KAAK,EAAE,IAAI;gBACX,WAAW,EAAE,KAAK;gBAClB,aAAa,EAAE,KAAK;gBACpB,cAAc,EAAE,IAAI;gBACpB,aAAa,EAAE,aAAa;gBAC5B,UAAU,EAAE,CAAC;aAChB,CAAC;QACN,CAAC;aAAM,CAAC;YACJ,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,CAAC,aAAa,CAAC,CAAC,KAAK,CAAC,CAAC,CAAQ,EAAE,EAAE;YACrC,OAAO,CAAC,KAAK,CAAC,0BAA0B,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC,CAAC,CAAC;IACP,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACT,OAAO,CAAC,KAAK,CACT,wCAAwC,GAAI,CAAW,CAAC,OAAO,EAC/D,CAAC,CACJ,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { ItemType } from "./indexSchema.js";
|
|
2
|
+
import { CreateEmbeddingText } from "./createEmbeddingText.js";
|
|
3
|
+
import { SemanticIndexerArguments } from "./commonYargs.js";
|
|
4
|
+
import { ChunkStrategyType } from "./chunker.js";
|
|
5
|
+
/**
 * Options accepted by the `semanticIndexer` entry point and checked by
 * `validateSemanticIndexerOptions`.
 */
export default interface SemanticIndexerOptions {
    /** Parsed command-line arguments (service URLs, credentials, `semanticIndexerConfig`). */
    argv: SemanticIndexerArguments;
    /** Identifier forwarded to the minion options. */
    id: string;
    /**
     * Kind of item being indexed. "registryRecord" requires `aspects`;
     * "storageObject" requires `formatTypes`.
     */
    itemType: ItemType;
    /** Aspects to subscribe to; must be non-empty when `itemType` is "registryRecord". */
    aspects?: string[];
    /** Optional aspects forwarded to the minion for registry records. */
    optionalAspects?: string[];
    /** Must be non-empty when `itemType` is "storageObject". */
    formatTypes?: string[];
    /** Callback producing the text to embed — NOTE(review): exact contract lives in createEmbeddingText. */
    createEmbeddingText: CreateEmbeddingText;
    /** When set, a user-defined chunk strategy is used instead of the recursive one. */
    chunkStrategy?: ChunkStrategyType;
    /**
     * Positive integer when set. Used for the recursive chunk strategy only
     * if `argv.semanticIndexerConfig` does not supply a value.
     */
    chunkSizeLimit?: number;
    /**
     * Non-negative integer when set; when both are set it must be less than
     * half of `chunkSizeLimit`.
     */
    overlap?: number;
    /** NOTE(review): not consumed by the entry point or validator — confirm semantics with the storage-object handler. */
    autoDownloadFile?: boolean;
    /** NOTE(review): format/unit of this string is not established here — confirm with its consumer. */
    timeout?: string;
}
|
|
19
|
+
/**
 * Validates semantic indexer options and throws an `Error` on the first
 * violation found.
 *
 * Checks performed: `chunkSizeLimit` (when defined) is a positive integer;
 * `overlap` (when defined) is a non-negative integer; when both are set,
 * `overlap` is less than half of `chunkSizeLimit`; `aspects` is non-empty
 * for item type "registryRecord"; `formatTypes` is non-empty for item type
 * "storageObject".
 *
 * @param options Options to validate.
 * @throws Error describing the first failed check.
 */
export declare function validateSemanticIndexerOptions(options: SemanticIndexerOptions): void;
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
export function validateSemanticIndexerOptions(options) {
|
|
2
|
+
if (options.chunkSizeLimit !== undefined && (
|
|
3
|
+
!Number.isInteger(options.chunkSizeLimit) ||
|
|
4
|
+
options.chunkSizeLimit <= 0)) {
|
|
5
|
+
throw new Error("'chunkSizeLimit' must be a positive integer");
|
|
6
|
+
}
|
|
7
|
+
if (options.overlap !== undefined && (
|
|
8
|
+
!Number.isInteger(options.overlap) || options.overlap < 0)) {
|
|
9
|
+
throw new Error("'overlap' must be a non-negative integer");
|
|
10
|
+
}
|
|
11
|
+
if (options.chunkSizeLimit &&
|
|
12
|
+
options.overlap &&
|
|
13
|
+
options.chunkSizeLimit <= options.overlap * 2) {
|
|
14
|
+
throw new Error("'overlap' must be less than half of 'chunkSizeLimit'");
|
|
15
|
+
}
|
|
16
|
+
if (options.itemType === "registryRecord") {
|
|
17
|
+
if (!options.aspects || options.aspects.length === 0) {
|
|
18
|
+
throw new Error("'aspects' is required");
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
if (options.itemType === "storageObject") {
|
|
22
|
+
if (!options.formatTypes || options.formatTypes.length === 0) {
|
|
23
|
+
throw new Error("'formatTypes' is required");
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"semanticIndexerOptions.js","sourceRoot":"","sources":["../src/semanticIndexerOptions.ts"],"names":[],"mappings":"AAqBA,MAAM,UAAU,8BAA8B,CAC1C,OAA+B;IAE/B,IACI,OAAO,CAAC,cAAc,KAAK,SAAS;QACpC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,OAAO,CAAC,cAAc,CAAC;YACtC,OAAO,CAAC,cAAc,IAAI,CAAC,CAAC,EAClC,CAAC;QACC,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACnE,CAAC;IACD,IACI,OAAO,CAAC,OAAO,KAAK,SAAS;QAC7B,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,OAAO,GAAG,CAAC,CAAC,EAC7D,CAAC;QACC,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;IAChE,CAAC;IACD,IACI,OAAO,CAAC,cAAc;QACtB,OAAO,CAAC,OAAO;QACf,OAAO,CAAC,cAAc,IAAI,OAAO,CAAC,OAAO,GAAG,CAAC,EAC/C,CAAC;QACC,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;IAC5E,CAAC;IACD,IAAI,OAAO,CAAC,QAAQ,KAAK,gBAAgB,EAAE,CAAC;QACxC,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnD,MAAM,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC;QAC7C,CAAC;IACL,CAAC;IACD,IAAI,OAAO,CAAC,QAAQ,KAAK,eAAe,EAAE,CAAC;QACvC,IAAI,CAAC,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3D,MAAM,IAAI,KAAK,CAAC,2BAA2B,CAAC,CAAC;QACjD,CAAC;IACL,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { SinonStub, SinonFakeTimers } from "sinon";
|
|
2
|
+
import SemanticIndexerOptions from "../semanticIndexerOptions.js";
|
|
3
|
+
/**
 * Type declaration for a shared test fixture used by semantic indexer tests.
 *
 * Only the public shape is declared here; the behaviour of each member lives
 * in the compiled implementation, which is not part of this declaration.
 * NOTE(review): member descriptions below are inferred from names and types
 * and should be confirmed against the implementation.
 */
export declare class BaseSemanticIndexerTest {
    // Collaborators handed to the code under test (presumably stubs/mocks).
    chunker: any;
    embeddingApiClient: any;
    opensearchApiClient: any;
    minioClient: any;
    registry: any;

    // Sinon stubs exposed so tests can inspect calls.
    createEmbeddingTextStub: SinonStub;
    consoleLogStub: SinonStub;
    consoleWarnStub: SinonStub;
    consoleErrorStub: SinonStub;

    // Sinon fake-timer handle.
    clock: SinonFakeTimers;

    // Default fixture values.
    DEFAULT_FAKE_TIME: Date;
    DEFAULT_PARENT_RECORD_ID: string;
    DEFAULT_CREATE_EMBEDDING_TEXT_RESULT: {
        text: string;
    };

    // Options object used by the tests; see updateUserConfig.
    userConfig: any;

    /**
     * @param options Optional settings bag; every property is optional and
     *   the whole argument may be omitted.
     */
    constructor({
        createEmbeddingTextResult,
        fakeTime,
        overridesConfig,
        suppressConsoleLogs
    }?: {
        createEmbeddingTextResult?: any;
        fakeTime?: Date;
        overridesConfig?: Partial<SemanticIndexerOptions>;
        suppressConsoleLogs?: boolean;
    });

    cleanup(): void;
    updateUserConfig(overrides?: Partial<SemanticIndexerOptions>): any;
    getCurrentTimeString(): string;
    expectCalledWith(stub: SinonStub, callIndex: number, ...expectedArgs: any[]): void;
    expectSuccessCalls(options?: {
        createEmbeddingTextCallCount?: number;
        chunkCallCount?: number;
        embeddingApiCallCount?: number;
        bulkIndexCallCount?: number;
        deleteByQueryCallCount?: number;
    }): void;
    expectIndexedDoc(expectedDoc: any, callIndex?: number): void;
    expectIndexedDocs(expectedDocs: any[], callIndex?: number): void;
    getIndexedDocs(callIndex?: number): any[];
}
|
|
41
|
+
/**
 * Returns a complete `SemanticIndexerOptions` object for use in tests.
 *
 * @param overrideConfig Optional partial options — presumably merged over
 *   built-in fake defaults; confirm against the implementation.
 * @returns A `SemanticIndexerOptions` value.
 */
export declare function createFakeSemanticIndexerConfig(overrideConfig?: Partial<SemanticIndexerOptions>): SemanticIndexerOptions;
|