@toxplanet/pegasus-sdk 1.1.17 → 1.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/chemicals.js CHANGED
@@ -1,1062 +1,1088 @@
1
- const { logError, logInfo } = require('@toxplanet/tphelper/logging');
2
- const { getDrizzle, schema } = require('./db');
3
- const { eq, sql, and, inArray, arrayContains } = require('drizzle-orm');
4
- const { SQSClient, SendMessageCommand } = require('@aws-sdk/client-sqs');
5
-
6
// Relevance boosts applied to the OpenSearch `should` clauses built by
// searchByName / searchBySynonym. "Primary" is the field the caller asked to
// search; "secondary" is the other of chemical_name / synonyms.
const SEARCH_BOOST_EXACT_PRIMARY = 100;
const SEARCH_BOOST_PREFIX_PRIMARY = 50;
const SEARCH_BOOST_EXACT_SECONDARY = 30;
const SEARCH_BOOST_PREFIX_SECONDARY = 10;

// Identifier keys accepted by getChemicalsByIdentifier; anything else is rejected.
const ALLOWED_IDENTIFIER_TYPES = new Set([
  'CAS',
  'SMILES',
  'InChI',
  'InChIKey',
  'PubChem',
  'DTXSID',
  'EINECS',
  'EC'
]);
12
-
13
/**
 * Escapes the characters that are special inside a SQL LIKE/ILIKE pattern
 * (`%`, `_` and the backslash escape character itself) by prefixing each with
 * a backslash, so the value is matched literally.
 *
 * Non-string values (e.g. a numeric identifier) are converted with `String`
 * instead of failing on a missing `.replace` method.
 *
 * @param {*} value - Text to embed in a LIKE pattern.
 * @returns {string} The escaped text.
 * @throws {TypeError} If `value` is null or undefined.
 */
function escapeLikePattern(value) {
  if (value == null) {
    throw new TypeError('escapeLikePattern: value must not be null or undefined');
  }
  return String(value).replace(/[%_\\]/g, '\\$&');
}
16
-
17
- class ChemicalsService {
18
- constructor(connection) {
19
- this.connection = connection;
20
- this.db = null;
21
- this.sqsClient = null;
22
- }
23
-
24
- getDb() {
25
- if (!this.db) {
26
- this.db = getDrizzle(this.connection.pgPool);
27
- }
28
- return this.db;
29
- }
30
-
31
- async sendSqlWriteFailure({ sql, parameters, error, retryCount, failedAt }) {
32
- try {
33
- const region = process.env.AWS_REGION || this.connection.region;
34
- const { awsAccountId, environment } = this.connection.config;
35
- const defaultQueueUrl = awsAccountId
36
- ? `https://sqs.${region}.amazonaws.com/${awsAccountId}/cr-pegasus-failed-items-${environment}`
37
- : null;
38
- const queueUrl = process.env.SQS_FAILED_ITEMS_QUEUE || defaultQueueUrl;
39
-
40
- if (!queueUrl) {
41
- logError('pegasus-sdk', 'sendSqlWriteFailure', 'No SQS queue URL available: set SQS_FAILED_ITEMS_QUEUE or provide awsAccountId in config');
42
- return false;
43
- }
44
-
45
- logInfo('pegasus-sdk', `[sendSqlWriteFailure] Using queue: ${queueUrl}${process.env.SQS_FAILED_ITEMS_QUEUE ? ' (from env)' : ' (default)'}`);
46
-
47
- if (!this.sqsClient) {
48
- this.sqsClient = new SQSClient({ region });
49
- }
50
-
51
- const message = {
52
- MessageType: 'SqlWriteFailure',
53
- SourceService: this.connection.config.sourceService || 'pegasus-sdk',
54
- Timestamp: (failedAt || new Date()).toISOString(),
55
- Sql: sql,
56
- Parameters: parameters,
57
- OriginalError: error.message,
58
- RetryCount: retryCount
59
- };
60
-
61
- const command = new SendMessageCommand({
62
- QueueUrl: queueUrl,
63
- MessageBody: JSON.stringify(message)
64
- });
65
-
66
- const response = await this.sqsClient.send(command);
67
- logInfo('pegasus-sdk', `[sendSqlWriteFailure] SqlWriteFailure posted to SQS: MessageId=${response.MessageId}, RetryCount=${retryCount}`);
68
- return true;
69
- } catch (sqsError) {
70
- logError('pegasus-sdk', 'sendSqlWriteFailure', 'Failed to post SqlWriteFailure to SQS', sqsError);
71
- return false;
72
- }
73
- }
74
-
75
- _buildChemicalUpsertSql(chemical) {
76
- const sql = [
77
- 'INSERT INTO chemicals (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)',
78
- 'VALUES (@source_id, @chemical_name, @chemical_meta::jsonb, @chemical_identifiers::jsonb, @chemical_synonyms, @chemical_categories, @created_at, @updated_at)',
79
- 'ON CONFLICT (source_id) DO UPDATE SET',
80
- ' chemical_name = @chemical_name,',
81
- ' chemical_meta = @chemical_meta::jsonb,',
82
- ' chemical_identifiers = @chemical_identifiers::jsonb,',
83
- ' chemical_synonyms = @chemical_synonyms,',
84
- ' chemical_categories = @chemical_categories,',
85
- ' updated_at = @updated_at'
86
- ].join('\n');
87
-
88
- const serializeDate = (d) => d instanceof Date ? d.toISOString() : d;
89
-
90
- const parameters = {
91
- '@source_id': chemical.sourceId,
92
- '@chemical_name': chemical.chemicalName,
93
- '@chemical_meta': JSON.stringify(chemical.chemicalMeta ?? {}),
94
- '@chemical_identifiers': JSON.stringify(chemical.chemicalIdentifiers ?? {}),
95
- '@chemical_synonyms': JSON.stringify(chemical.chemicalSynonyms ?? []),
96
- '@chemical_categories': JSON.stringify(chemical.chemicalCategories ?? []),
97
- '@created_at': serializeDate(chemical.createdAt),
98
- '@updated_at': serializeDate(chemical.updatedAt)
99
- };
100
-
101
- return { sql, parameters };
102
- }
103
-
104
- _buildDebugSql(chemical) {
105
- const esc = (s) => `'${String(s ?? '').replace(/'/g, "''")}'`;
106
- const escJson = (v) => `'${JSON.stringify(v ?? {}).replace(/'/g, "''")}'`;
107
- const escArr = (arr) => {
108
- if (!Array.isArray(arr) || arr.length === 0) return `ARRAY[]::text[]`;
109
- return `ARRAY[${arr.map(s => esc(s)).join(', ')}]`;
110
- };
111
- const escDate = (d) => esc(d instanceof Date ? d.toISOString() : (d ?? new Date().toISOString()));
112
-
113
- return [
114
- `INSERT INTO chemicals`,
115
- ` (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)`,
116
- `VALUES (`,
117
- ` ${esc(chemical.sourceId)},`,
118
- ` ${esc(chemical.chemicalName)},`,
119
- ` ${escJson(chemical.chemicalMeta)}::jsonb,`,
120
- ` ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
121
- ` ${escArr(chemical.chemicalSynonyms)},`,
122
- ` ${escArr(chemical.chemicalCategories)},`,
123
- ` ${escDate(chemical.createdAt)},`,
124
- ` ${escDate(chemical.updatedAt)}`,
125
- `)`,
126
- `ON CONFLICT (source_id) DO UPDATE SET`,
127
- ` chemical_name = ${esc(chemical.chemicalName)},`,
128
- ` chemical_meta = ${escJson(chemical.chemicalMeta)}::jsonb,`,
129
- ` chemical_identifiers = ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
130
- ` chemical_synonyms = ${escArr(chemical.chemicalSynonyms)},`,
131
- ` chemical_categories = ${escArr(chemical.chemicalCategories)},`,
132
- ` updated_at = NOW();`
133
- ].join('\n');
134
- }
135
-
136
- async bulkIndexFielded(documents) {
137
- try {
138
- logInfo('pegasus-sdk', `[bulkIndexFielded] Starting bulk index with ${documents?.length || 0} documents`);
139
-
140
- if (!documents || documents.length === 0) {
141
- logInfo('pegasus-sdk', `[bulkIndexFielded] No documents provided, returning empty result`);
142
- return { indexed: 0, errors: [], results: [] };
143
- }
144
-
145
- const db = this.getDb();
146
- const results = [];
147
- const errors = [];
148
-
149
- logInfo('pegasus-sdk', `[bulkIndexFielded] Database connection established`);
150
-
151
- for (let i = 0; i < documents.length; i++) {
152
- const doc = documents[i];
153
- logInfo('pegasus-sdk', `[bulkIndexFielded] Processing document ${i}: source_id=${doc.source_id}, chemical_name=${doc.chemical_name}`);
154
-
155
- const parseDate = (dateValue) => {
156
- if (!dateValue) return new Date();
157
- if (dateValue instanceof Date) return dateValue;
158
- if (typeof dateValue === 'string') return new Date(dateValue);
159
- return new Date();
160
- };
161
-
162
- const chemical = {
163
- sourceId: doc.source_id || doc._id,
164
- chemicalName: doc.chemical_name || doc.name,
165
- chemicalMeta: doc.chemical_meta || {},
166
- chemicalIdentifiers: doc.chemical_identifiers || {},
167
- chemicalSynonyms: doc.chemical_synonyms || [],
168
- chemicalCategories: doc.chemical_categories || [],
169
- createdAt: parseDate(doc.created_at),
170
- updatedAt: parseDate(doc.updated_at),
171
- ...(doc.imported_at && { importedAt: doc.imported_at }),
172
- ...(doc.chemical_id && { chemicalId: doc.chemical_id })
173
- };
174
-
175
- logInfo('pegasus-sdk', `[bulkIndexFielded] Prepared chemical object: sourceId=${chemical.sourceId}, chemicalName=${chemical.chemicalName}`);
176
- logInfo('pegasus-sdk', `[bulkIndexFielded] DEBUG SQL for document ${i}:\n${this._buildDebugSql(chemical)}`);
177
-
178
- const isConnectionError = (err) =>
179
- err.message?.toLowerCase().includes('timeout') ||
180
- err.message?.toLowerCase().includes('connection') ||
181
- err.code === 'ECONNREFUSED' ||
182
- err.code === 'ETIMEDOUT';
183
-
184
- const attemptUpsert = () =>
185
- db.insert(schema.chemicals)
186
- .values(chemical)
187
- .onConflictDoUpdate({
188
- target: schema.chemicals.sourceId,
189
- set: {
190
- chemicalName: chemical.chemicalName,
191
- chemicalMeta: chemical.chemicalMeta,
192
- chemicalIdentifiers: chemical.chemicalIdentifiers,
193
- chemicalSynonyms: chemical.chemicalSynonyms,
194
- chemicalCategories: chemical.chemicalCategories,
195
- updatedAt: new Date()
196
- }
197
- })
198
- .returning({
199
- chemicalId: schema.chemicals.chemicalId,
200
- sourceId: schema.chemicals.sourceId
201
- });
202
-
203
- let lastError = null;
204
- let retryCount = 0;
205
- const failedAt = new Date();
206
-
207
- try {
208
- const [result] = await attemptUpsert();
209
- logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully: ${result?.chemicalId || 'no ID returned'}`);
210
- results.push({ index: i, success: true, result });
211
- continue;
212
- } catch (firstErr) {
213
- lastError = firstErr;
214
- if (!isConnectionError(firstErr)) {
215
- logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} first attempt failed (${firstErr.message}), retrying once`);
216
- try {
217
- const [result] = await attemptUpsert();
218
- logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully on retry: ${result?.chemicalId || 'no ID returned'}`);
219
- results.push({ index: i, success: true, result });
220
- continue;
221
- } catch (retryErr) {
222
- lastError = retryErr;
223
- retryCount = 1;
224
- }
225
- }
226
- }
227
-
228
- logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} failed after ${retryCount} local retries (source_id=${chemical.sourceId})`, lastError);
229
-
230
- const { sql: failureSql, parameters: failureParams } = this._buildChemicalUpsertSql(chemical);
231
- const queued = await this.sendSqlWriteFailure({
232
- sql: failureSql,
233
- parameters: failureParams,
234
- error: lastError,
235
- retryCount,
236
- failedAt
237
- });
238
-
239
- if (queued) {
240
- logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} (source_id=${chemical.sourceId}) queued for repair via SQS`);
241
- } else {
242
- logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} (source_id=${chemical.sourceId}) failed and could not be queued — data loss risk`, lastError);
243
- }
244
-
245
- results.push({ index: i, success: false, error: lastError.message, queued });
246
- errors.push({ document: doc, error: lastError.message, queued });
247
- }
248
-
249
- const successCount = results.filter(r => r.success).length;
250
- const queuedCount = results.filter(r => !r.success && r.queued).length;
251
- logInfo('pegasus-sdk', `[bulkIndexFielded] Bulk index complete: ${successCount}/${documents.length} succeeded, ${queuedCount} queued for repair, ${errors.length - queuedCount} unhandled errors`);
252
-
253
- return { indexed: successCount, errors, results };
254
- } catch (error) {
255
- logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFielded', error);
256
- throw error;
257
- }
258
- }
259
-
260
- async bulkIndexFulltext(documents) {
261
- try {
262
- return { acknowledged: true, count: documents?.length || 0 };
263
- } catch (error) {
264
- logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFulltext', error);
265
- throw error;
266
- }
267
- }
268
-
269
- async bulkIndexSubstances(substances) {
270
- try {
271
- const documents = substances.map(substance => ({
272
- source_id: substance.substance_id || substance.id,
273
- chemical_name: substance.name || substance.substance_name,
274
- chemical_meta: substance.meta || {},
275
- chemical_identifiers: substance.identifiers || {},
276
- chemical_synonyms: substance.synonyms || [],
277
- chemical_categories: substance.categories || substance.substance_types || [],
278
- created_at: substance.created_at,
279
- updated_at: substance.updated_at,
280
- imported_at: substance.imported_at
281
- }));
282
-
283
- return await this.bulkIndexFielded(documents);
284
- } catch (error) {
285
- logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexSubstances', error);
286
- throw error;
287
- }
288
- }
289
-
290
- async createChemical(chemical) {
291
- try {
292
- const db = this.getDb();
293
-
294
- const [result] = await db
295
- .insert(schema.chemicals)
296
- .values({
297
- sourceId: chemical.source_id,
298
- chemicalName: chemical.chemical_name,
299
- chemicalMeta: chemical.chemical_meta,
300
- chemicalIdentifiers: chemical.chemical_identifiers,
301
- chemicalSynonyms: chemical.chemical_synonyms,
302
- chemicalCategories: chemical.chemical_categories,
303
- createdAt: chemical.created_at || new Date(),
304
- updatedAt: chemical.updated_at || new Date(),
305
- ...(chemical.imported_at && { importedAt: chemical.imported_at }),
306
- ...(chemical.chemical_id && { chemicalId: chemical.chemical_id })
307
- })
308
- .returning();
309
-
310
- return result;
311
- } catch (error) {
312
- logError('pegasus-sdk', 'ChemicalsService', 'createChemical', error);
313
- throw error;
314
- }
315
- }
316
-
317
- async updateChemical(chemicalId, updates) {
318
- try {
319
- const db = this.getDb();
320
-
321
- const updateData = {};
322
- if (updates.chemical_name) updateData.chemicalName = updates.chemical_name;
323
- if (updates.chemical_meta) updateData.chemicalMeta = updates.chemical_meta;
324
- if (updates.chemical_identifiers) updateData.chemicalIdentifiers = updates.chemical_identifiers;
325
- if (updates.chemical_synonyms) updateData.chemicalSynonyms = updates.chemical_synonyms;
326
- if (updates.chemical_categories) updateData.chemicalCategories = updates.chemical_categories;
327
- updateData.updatedAt = new Date();
328
-
329
- const [result] = await db
330
- .update(schema.chemicals)
331
- .set(updateData)
332
- .where(eq(schema.chemicals.chemicalId, chemicalId))
333
- .returning();
334
-
335
- return result || null;
336
- } catch (error) {
337
- logError('pegasus-sdk', 'ChemicalsService', 'updateChemical', error);
338
- throw error;
339
- }
340
- }
341
-
342
- async deleteChemical(chemicalId) {
343
- try {
344
- const db = this.getDb();
345
-
346
- const [deleted] = await db
347
- .delete(schema.chemicals)
348
- .where(eq(schema.chemicals.chemicalId, chemicalId))
349
- .returning();
350
-
351
- return deleted || null;
352
- } catch (error) {
353
- logError('pegasus-sdk', 'ChemicalsService', 'deleteChemical', error);
354
- throw error;
355
- }
356
- }
357
-
358
- async deleteBySourceId(sourceId) {
359
- try {
360
- const db = this.getDb();
361
-
362
- const [deleted] = await db
363
- .delete(schema.chemicals)
364
- .where(eq(schema.chemicals.sourceId, sourceId))
365
- .returning();
366
-
367
- return deleted || null;
368
- } catch (error) {
369
- logError('pegasus-sdk', 'ChemicalsService', 'deleteBySourceId', error);
370
- throw error;
371
- }
372
- }
373
-
374
- async deleteCollection(collectionName) {
375
- try {
376
- const db = this.getDb();
377
-
378
- const deleted = await db
379
- .delete(schema.chemicals)
380
- .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]))
381
- .returning();
382
-
383
- return { deletedCount: deleted.length, deleted };
384
- } catch (error) {
385
- logError('pegasus-sdk', 'ChemicalsService', 'deleteCollection', error);
386
- throw error;
387
- }
388
- }
389
-
390
- async updateCollectionProperty(collectionName, propertyPath, newValue) {
391
- try {
392
- const db = this.getDb();
393
- const pathArray = propertyPath.split('.');
394
- const valueJson = JSON.stringify(newValue);
395
-
396
- const results = await db
397
- .update(schema.chemicals)
398
- .set({
399
- chemicalMeta: sql`jsonb_set(${schema.chemicals.chemicalMeta}, ${pathArray}::text[], ${valueJson}::jsonb)`,
400
- updatedAt: new Date()
401
- })
402
- .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]))
403
- .returning();
404
-
405
- return { updatedCount: results.length, updated: results };
406
- } catch (error) {
407
- logError('pegasus-sdk', 'ChemicalsService', 'updateCollectionProperty', error);
408
- throw error;
409
- }
410
- }
411
-
412
- async bulkUpdateProperty(filter, propertyPath, newValue) {
413
- try {
414
- const db = this.getDb();
415
-
416
- let whereCondition = sql`1=1`;
417
-
418
- if (filter.chemicalIds && filter.chemicalIds.length > 0) {
419
- whereCondition = inArray(schema.chemicals.chemicalId, filter.chemicalIds);
420
- } else if (filter.sourceIds && filter.sourceIds.length > 0) {
421
- whereCondition = inArray(schema.chemicals.sourceId, filter.sourceIds);
422
- } else if (filter.category) {
423
- whereCondition = arrayContains(schema.chemicals.chemicalCategories, [filter.category]);
424
- }
425
-
426
- const pathArray = propertyPath.split('.');
427
- const valueJson = JSON.stringify(newValue);
428
-
429
- const results = await db
430
- .update(schema.chemicals)
431
- .set({
432
- chemicalMeta: sql`jsonb_set(COALESCE(${schema.chemicals.chemicalMeta}, '{}'), ${pathArray}::text[], ${valueJson}::jsonb)`,
433
- updatedAt: new Date()
434
- })
435
- .where(whereCondition)
436
- .returning();
437
-
438
- return { updatedCount: results.length, updated: results };
439
- } catch (error) {
440
- logError('pegasus-sdk', 'ChemicalsService', 'bulkUpdateProperty', error);
441
- throw error;
442
- }
443
- }
444
-
445
- async getChemicalById(chemicalId) {
446
- try {
447
- const db = this.getDb();
448
-
449
- const [result] = await db
450
- .select()
451
- .from(schema.chemicals)
452
- .where(eq(schema.chemicals.chemicalId, chemicalId))
453
- .limit(1);
454
-
455
- return result || null;
456
- } catch (error) {
457
- logError('pegasus-sdk', 'ChemicalsService', 'getChemicalById', error);
458
- throw error;
459
- }
460
- }
461
-
462
- async getChemicalBySourceId(sourceId) {
463
- try {
464
- const db = this.getDb();
465
-
466
- const [result] = await db
467
- .select()
468
- .from(schema.chemicals)
469
- .where(eq(schema.chemicals.sourceId, sourceId))
470
- .limit(1);
471
-
472
- return result || null;
473
- } catch (error) {
474
- logError('pegasus-sdk', 'ChemicalsService', 'getChemicalBySourceId', error);
475
- throw error;
476
- }
477
- }
478
-
479
- async getChemicalsByCAS(casNumber) {
480
- try {
481
- const db = this.getDb();
482
-
483
- const results = await db
484
- .select()
485
- .from(schema.chemicals)
486
- .where(sql`${schema.chemicals.chemicalIdentifiers}->>'CAS' = ${casNumber} OR ${schema.chemicals.chemicalIdentifiers}->'CAS' ? ${casNumber}`);
487
-
488
- return results;
489
- } catch (error) {
490
- logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByCAS', error);
491
- throw error;
492
- }
493
- }
494
-
495
- async getChemicalsByIdentifier(identifierType, identifierValue) {
496
- try {
497
- if (!ALLOWED_IDENTIFIER_TYPES.has(identifierType)) {
498
- throw new Error(`Invalid identifier type: ${identifierType}`);
499
- }
500
-
501
- const db = this.getDb();
502
-
503
- const results = await db
504
- .select()
505
- .from(schema.chemicals)
506
- .where(sql`${schema.chemicals.chemicalIdentifiers}->>${identifierType} = ${identifierValue} OR ${schema.chemicals.chemicalIdentifiers}->${identifierType} ? ${identifierValue}`);
507
-
508
- return results;
509
- } catch (error) {
510
- logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByIdentifier', error);
511
- throw error;
512
- }
513
- }
514
-
515
- async countByCollection(collectionName) {
516
- try {
517
- const db = this.getDb();
518
-
519
- const result = await db
520
- .select({ count: sql`count(*)::int` })
521
- .from(schema.chemicals)
522
- .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]));
523
-
524
- return { count: result[0].count };
525
- } catch (error) {
526
- logError('pegasus-sdk', 'ChemicalsService', 'countByCollection', error);
527
- throw error;
528
- }
529
- }
530
-
531
- async countByIdentifier(identifierValue) {
532
- try {
533
- const db = this.getDb();
534
-
535
- const searchPattern = `%${escapeLikePattern(identifierValue)}%`;
536
- const result = await db
537
- .select({ count: sql`count(*)::int` })
538
- .from(schema.chemicals)
539
- .where(sql`${schema.chemicals.chemicalIdentifiers}::text LIKE ${searchPattern}`);
540
-
541
- return { count: result[0].count };
542
- } catch (error) {
543
- logError('pegasus-sdk', 'ChemicalsService', 'countByIdentifier', error);
544
- throw error;
545
- }
546
- }
547
-
548
- async countByCAS(casNumber) {
549
- try {
550
- const db = this.getDb();
551
-
552
- const result = await db
553
- .select({ count: sql`count(*)::int` })
554
- .from(schema.chemicals)
555
- .where(sql`${schema.chemicals.chemicalIdentifiers}->>'CAS' = ${casNumber} OR ${schema.chemicals.chemicalIdentifiers}->'CAS' ? ${casNumber}`);
556
-
557
- return { count: result[0].count };
558
- } catch (error) {
559
- logError('pegasus-sdk', 'ChemicalsService', 'countByCAS', error);
560
- throw error;
561
- }
562
- }
563
-
564
- async getTotalSynonymCount() {
565
- try {
566
- const db = this.getDb();
567
-
568
- const result = await db
569
- .select({ count: sql`sum(array_length(${schema.chemicals.chemicalSynonyms}, 1))::int` })
570
- .from(schema.chemicals);
571
-
572
- return { count: result[0].count || 0 };
573
- } catch (error) {
574
- logError('pegasus-sdk', 'ChemicalsService', 'getTotalSynonymCount', error);
575
- throw error;
576
- }
577
- }
578
-
579
- async getSynonymCount(synonymTerm) {
580
- try {
581
- const db = this.getDb();
582
-
583
- const result = await db
584
- .select({ count: sql`count(*)::int` })
585
- .from(schema.chemicals)
586
- .where(arrayContains(schema.chemicals.chemicalSynonyms, [synonymTerm]));
587
-
588
- return { count: result[0].count };
589
- } catch (error) {
590
- logError('pegasus-sdk', 'ChemicalsService', 'getSynonymCount', error);
591
- throw error;
592
- }
593
- }
594
-
595
- async convertIdentifier(fromIdentifier, toIdentifierType) {
596
- try {
597
- const db = this.getDb();
598
-
599
- const searchPattern = `%${escapeLikePattern(fromIdentifier)}%`;
600
- const chemicals = await db
601
- .select()
602
- .from(schema.chemicals)
603
- .where(sql`${schema.chemicals.chemicalIdentifiers}::text LIKE ${searchPattern}`);
604
-
605
- if (chemicals.length === 0) {
606
- return null;
607
- }
608
-
609
- const chemical = chemicals[0];
610
- const identifiers = chemical.chemicalIdentifiers || {};
611
- const toIdentifier = identifiers[toIdentifierType];
612
-
613
- return {
614
- fromIdentifier,
615
- toIdentifierType,
616
- toIdentifier,
617
- chemicalId: chemical.chemicalId,
618
- chemicalName: chemical.chemicalName
619
- };
620
- } catch (error) {
621
- logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifier', error);
622
- throw error;
623
- }
624
- }
625
-
626
- async convertIdentifiersBatch(fromIdentifiers, toIdentifierType) {
627
- try {
628
- const conversions = await Promise.all(
629
- fromIdentifiers.map(fromIdentifier =>
630
- this.convertIdentifier(fromIdentifier, toIdentifierType)
631
- )
632
- );
633
-
634
- return conversions.filter(conversion => conversion !== null);
635
- } catch (error) {
636
- logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifiersBatch', error);
637
- throw error;
638
- }
639
- }
640
-
641
- /**
642
- * Search for chemicals by name using OpenSearch
643
- * @param {string} searchTerm - Name to search for
644
- * @param {number} limit - Maximum number of results (default: 10)
645
- * @returns {Promise<Object>} Search results
646
- */
647
- async searchByName(searchTerm, limit = 10) {
648
- if (!searchTerm) {
649
- return { results: [] };
650
- }
651
-
652
- try {
653
- const opensearchClient = this.connection.getOpenSearchClient();
654
- const indexName = this.connection.getOpenSearchIndex();
655
-
656
- const response = await opensearchClient.search({
657
- index: indexName,
658
- body: {
659
- size: limit,
660
- query: {
661
- bool: {
662
- should: [
663
- { term: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
664
- { prefix: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
665
- { term: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
666
- { prefix: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } }
667
- ],
668
- minimum_should_match: 1
669
- }
670
- },
671
- _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
672
- }
673
- });
674
-
675
- const hits = response.body?.hits?.hits || [];
676
- const results = hits.map((hit) => ({
677
- id: hit._source.postgres_id,
678
- name: hit._source.chemical_name,
679
- cas: hit._source.cas_numbers || [],
680
- identifiers: hit._source.identifier_values || [],
681
- synonyms: hit._source.synonyms || [],
682
- score: hit._score
683
- }));
684
-
685
- return { results };
686
- } catch (error) {
687
- logError('pegasus-sdk', 'ChemicalsService', 'searchByName', error);
688
- throw error;
689
- }
690
- }
691
-
692
- /**
693
- * Search for chemicals by synonym using OpenSearch
694
- * @param {string} synonymTerm - Synonym to search for
695
- * @param {number} limit - Maximum number of results (default: 10)
696
- * @returns {Promise<Object>} Search results
697
- */
698
- async searchBySynonym(synonymTerm, limit = 10) {
699
- if (!synonymTerm) {
700
- return { results: [] };
701
- }
702
-
703
- try {
704
- const opensearchClient = this.connection.getOpenSearchClient();
705
- const indexName = this.connection.getOpenSearchIndex();
706
-
707
- const response = await opensearchClient.search({
708
- index: indexName,
709
- body: {
710
- size: limit,
711
- query: {
712
- bool: {
713
- should: [
714
- { term: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
715
- { prefix: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
716
- { term: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
717
- { prefix: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } }
718
- ],
719
- minimum_should_match: 1
720
- }
721
- },
722
- _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
723
- }
724
- });
725
-
726
- const hits = response.body?.hits?.hits || [];
727
- const results = hits.map((hit) => ({
728
- id: hit._source.postgres_id,
729
- name: hit._source.chemical_name,
730
- cas: hit._source.cas_numbers || [],
731
- identifiers: hit._source.identifier_values || [],
732
- synonyms: hit._source.synonyms || [],
733
- score: hit._score
734
- }));
735
-
736
- return { results };
737
- } catch (error) {
738
- logError('pegasus-sdk', 'ChemicalsService', 'searchBySynonym', error);
739
- throw error;
740
- }
741
- }
742
-
743
- async countAll() {
744
- try {
745
- const db = this.getDb();
746
- const result = await db
747
- .select({ count: sql`count(*)::int` })
748
- .from(schema.chemicals);
749
- return { count: result[0].count };
750
- } catch (error) {
751
- logError('pegasus-sdk', 'ChemicalsService', 'countAll', error);
752
- throw error;
753
- }
754
- }
755
-
756
- async findChemicalsWithoutDocuments(collectionName, searchTerm, pageSize = 100) {
757
- try {
758
- const db = this.getDb();
759
-
760
- let whereConditions = [];
761
-
762
- if (collectionName) {
763
- whereConditions.push(arrayContains(schema.chemicals.chemicalCategories, [collectionName]));
764
- }
765
-
766
- if (searchTerm) {
767
- const searchPattern = `%${escapeLikePattern(searchTerm)}%`;
768
- whereConditions.push(sql`${schema.chemicals.chemicalName} ILIKE ${searchPattern}`);
769
- }
770
-
771
- const whereClause = whereConditions.length > 0 ? and(...whereConditions) : undefined;
772
-
773
- const results = await db
774
- .select()
775
- .from(schema.chemicals)
776
- .where(whereClause)
777
- .limit(pageSize);
778
-
779
- return results;
780
- } catch (error) {
781
- logError('pegasus-sdk', 'ChemicalsService', 'findChemicalsWithoutDocuments', error);
782
- throw error;
783
- }
784
- }
785
-
786
- async countChemicalsWithoutDocuments(collectionName) {
787
- try {
788
- const db = this.getDb();
789
-
790
- const whereClause = collectionName
791
- ? arrayContains(schema.chemicals.chemicalCategories, [collectionName])
792
- : undefined;
793
-
794
- const result = await db
795
- .select({ count: sql`count(*)::int` })
796
- .from(schema.chemicals)
797
- .where(whereClause);
798
-
799
- return { count: result[0].count };
800
- } catch (error) {
801
- logError('pegasus-sdk', 'ChemicalsService', 'countChemicalsWithoutDocuments', error);
802
- throw error;
803
- }
804
- }
805
-
806
- _buildEsHandlers() {
807
- return {
808
- index: async (params) => {
809
- const chemical = params.body;
810
- const result = await this.createChemical(chemical);
811
-
812
- return {
813
- _index: params.index,
814
- _id: result.chemicalId,
815
- _version: 1,
816
- result: 'created',
817
- _source: result
818
- };
819
- },
820
-
821
- bulk: async (params) => {
822
- const operations = params.body || params.operations;
823
-
824
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Starting bulk operation with ${operations?.length || 0} total operations`);
825
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Params index: ${params.index}`);
826
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Operations array type: ${Array.isArray(operations) ? 'array' : typeof operations}`);
827
-
828
- const cdiDocuments = [];
829
- let cdiOpCount = 0;
830
- let otherOpCount = 0;
831
-
832
- for (let i = 0; i < operations.length; i++) {
833
- const op = operations[i];
834
- const isIndexOp = !!(op.index || op.create);
835
- const indexName = op.index?._index || op.create?._index || op.delete?._index || op.update?._index;
836
-
837
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Op[${i}]: action=${Object.keys(op)[0] || 'unknown'}, index=${indexName}`);
838
-
839
- if ((op.index || op.create) &&
840
- (op.index?._index === 'chemical_data_index' || op.create?._index === 'chemical_data_index')) {
841
- const doc = operations[i + 1];
842
- const sourceId = op.index?._id || op.create?._id;
843
-
844
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Found CDI entry: sourceId=${sourceId}, hasDoc=${!!doc}`);
845
-
846
- if (doc && sourceId) {
847
- const cdiDoc = {
848
- source_id: sourceId,
849
- chemical_name: doc.chemical_primary_name || (doc.chemical_names && doc.chemical_names[0]) || null,
850
- chemical_meta: doc.chemical_meta || {},
851
- chemical_identifiers: doc.chemical_identifiers || {},
852
- chemical_synonyms: doc.chemical_synonyms || [],
853
- chemical_categories: doc.chemical_categories || [],
854
- created_at: doc.chemical_created_at ? (typeof doc.chemical_created_at === 'string' ? new Date(doc.chemical_created_at) : doc.chemical_created_at) : new Date(),
855
- updated_at: doc.chemical_updated_at ? (typeof doc.chemical_updated_at === 'string' ? new Date(doc.chemical_updated_at) : doc.chemical_updated_at) : new Date()
856
- };
857
- cdiDocuments.push(cdiDoc);
858
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Extracted CDI doc: ${JSON.stringify({ source_id: cdiDoc.source_id, chemical_name: cdiDoc.chemical_name })}`);
859
- i++;
860
- cdiOpCount++;
861
- } else {
862
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] CDI entry incomplete: sourceId=${sourceId}, doc=${!!doc}`);
863
- }
864
- } else {
865
- otherOpCount++;
866
- }
867
- }
868
-
869
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Scan complete: ${cdiOpCount} CDI docs found, ${otherOpCount} other operations skipped`);
870
-
871
- if (cdiDocuments.length === 0) {
872
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] No CDI documents to index, returning empty no-op response`);
873
- return { took: 0, errors: false, items: [] };
874
- }
875
-
876
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Calling bulkIndexFielded with ${cdiDocuments.length} CDI documents`);
877
-
878
- try {
879
- const result = await this.bulkIndexFielded(cdiDocuments);
880
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] bulkIndexFielded returned: indexed=${result.indexed}, errors=${result.errors.length}`);
881
-
882
- if (result.errors.length > 0) {
883
- logError('pegasus-sdk', 'ChemicalsService.bulk', 'Errors during bulk indexing', result.errors);
884
- }
885
-
886
- return {
887
- took: 1,
888
- errors: result.errors.length > 0,
889
- items: result.results.map((res, idx) => ({
890
- index: {
891
- _index: 'chemical_data_index',
892
- _id: cdiDocuments[idx].source_id,
893
- status: res.success ? 200 : 400,
894
- result: res.success ? 'created' : 'error',
895
- ...(res.success ? {} : { error: { type: 'mapper_parsing_exception', reason: res.error } })
896
- }
897
- }))
898
- };
899
- } catch (error) {
900
- logError('pegasus-sdk', 'ChemicalsService.bulk', 'Fatal error during bulk indexing', error);
901
- throw error;
902
- }
903
- },
904
-
905
- get: async (params) => {
906
- const result = await this.getChemicalBySourceId(params.id);
907
-
908
- if (!result) {
909
- return {
910
- _index: params.index,
911
- _id: params.id,
912
- found: false
913
- };
914
- }
915
-
916
- return {
917
- _index: params.index,
918
- _id: params.id,
919
- _version: 1,
920
- found: true,
921
- _source: result
922
- };
923
- },
924
-
925
- update: async (params) => {
926
- const result = await this.updateChemical(params.id, params.body);
927
-
928
- return {
929
- _index: params.index,
930
- _id: params.id,
931
- _version: 2,
932
- result: result ? 'updated' : 'noop',
933
- _source: result
934
- };
935
- },
936
-
937
- delete: async (params) => {
938
- if (params.index === 'synonym_lookup_index') {
939
- return { _index: params.index, _id: params.id, result: 'not_found' };
940
- }
941
- const result = await this.deleteBySourceId(params.id);
942
-
943
- return {
944
- _index: params.index,
945
- _id: params.id,
946
- result: result ? 'deleted' : 'not_found'
947
- };
948
- },
949
-
950
- deleteByQuery: async (params) => {
951
- const sourceId = params.body?.query?.term?.chemical_set_identifier
952
- || params.body?.query?.term?.source_id;
953
- if (!sourceId) {
954
- return { deleted: 0, failures: [] };
955
- }
956
- const result = await this.deleteBySourceId(sourceId);
957
- return {
958
- deleted: result ? 1 : 0,
959
- failures: []
960
- };
961
- },
962
-
963
- search: async (params) => {
964
- let searchTerm = '';
965
- let limit = params.body?.size || 10;
966
-
967
- if (params.index === 'synonym_lookup_index') {
968
- const query = params.body?.query;
969
- searchTerm = query?.match?.chemical_name ||
970
- query?.term?.chemical_name ||
971
- query?.query_string?.query || '';
972
- const searchResults = await this.searchBySynonym(searchTerm, limit);
973
-
974
- return {
975
- took: 1,
976
- timed_out: false,
977
- _shards: {
978
- total: 1,
979
- successful: 1,
980
- skipped: 0,
981
- failed: 0
982
- },
983
- hits: {
984
- total: {
985
- value: searchResults.results.length,
986
- relation: 'eq'
987
- },
988
- max_score: searchResults.results[0]?.score || 0,
989
- hits: searchResults.results.map(result => ({
990
- _index: params.index,
991
- _id: result.id,
992
- _score: result.score,
993
- _source: {
994
- postgres_id: result.id,
995
- chemical_name: result.name,
996
- cas_numbers: result.cas,
997
- identifier_values: result.identifiers,
998
- synonyms: result.synonyms
999
- }
1000
- }))
1001
- }
1002
- };
1003
- } else {
1004
- const query = params.body?.query;
1005
- searchTerm = query?.match?.chemical_name ||
1006
- query?.term?.chemical_name ||
1007
- query?.query_string?.query || '';
1008
- const searchResults = await this.searchByName(searchTerm, limit);
1009
-
1010
- return {
1011
- took: 1,
1012
- timed_out: false,
1013
- _shards: {
1014
- total: 1,
1015
- successful: 1,
1016
- skipped: 0,
1017
- failed: 0
1018
- },
1019
- hits: {
1020
- total: {
1021
- value: searchResults.results.length,
1022
- relation: 'eq'
1023
- },
1024
- max_score: searchResults.results[0]?.score || 0,
1025
- hits: searchResults.results.map(result => ({
1026
- _index: params.index,
1027
- _id: result.id,
1028
- _score: result.score,
1029
- _source: {
1030
- postgres_id: result.id,
1031
- chemical_name: result.name,
1032
- cas_numbers: result.cas,
1033
- identifier_values: result.identifiers,
1034
- synonyms: result.synonyms
1035
- }
1036
- }))
1037
- }
1038
- };
1039
- }
1040
- },
1041
-
1042
// Handler for count requests: 'synonym_lookup_index' reports the total synonym
// count, any other index reports the number of chemical rows.
count: async (params) => (
  params.index === 'synonym_lookup_index'
    ? this.getTotalSynonymCount()
    : this.countAll()
)
1048
- };
1049
- }
1050
-
1051
- registerElasticsearchHandlers(elasticsearchService) {
1052
- const configurablePatterns = this.connection.config.indexRoutes?.chemicals || ['chemicals*'];
1053
- const legacyPatterns = ['synonym_lookup_index', 'chemical_data_index', 'chemical_converter_index'];
1054
- const allPatterns = [...new Set([...configurablePatterns, ...legacyPatterns])];
1055
- const handlers = this._buildEsHandlers();
1056
- allPatterns.forEach(pattern => {
1057
- elasticsearchService.registerIndexRoute(pattern, handlers);
1058
- });
1059
- }
1060
- }
1061
-
1
+ const { logError, logInfo } = require('@toxplanet/tphelper/logging');
2
+ const { getDrizzle, schema } = require('./db');
3
+ const { eq, sql, and, inArray, arrayContains } = require('drizzle-orm');
4
+ const { SQSClient, SendMessageCommand } = require('@aws-sdk/client-sqs');
5
+
6
// Relevance boosts applied to the OpenSearch clauses built by searchByName and
// searchBySynonym: "primary" is the field the search targets, "secondary" the other one.
const SEARCH_BOOST_EXACT_PRIMARY = 100;
const SEARCH_BOOST_PREFIX_PRIMARY = 50;
const SEARCH_BOOST_EXACT_SECONDARY = 30;
const SEARCH_BOOST_PREFIX_SECONDARY = 10;

// Identifier keys that getChemicalsByIdentifier accepts; anything else is rejected.
const ALLOWED_IDENTIFIER_TYPES = new Set([
  'CAS',
  'SMILES',
  'InChI',
  'InChIKey',
  'PubChem',
  'DTXSID',
  'EINECS',
  'EC'
]);
12
+
13
/**
 * Escapes the LIKE metacharacters (%, _ and backslash) in a value so it can be
 * embedded in a LIKE pattern and matched literally.
 *
 * Non-string values (e.g. numeric identifiers) are converted with String() first.
 *
 * @param {*} value - Value to escape; must not be null or undefined
 * @returns {string} The value with %, _ and \ each prefixed by a backslash
 * @throws {TypeError} If value is null or undefined
 */
function escapeLikePattern(value) {
  if (value == null) {
    // Keep rejecting missing input: an empty pattern would match every row.
    throw new TypeError('escapeLikePattern: value must not be null or undefined');
  }
  return String(value).replace(/[%_\\]/g, '\\$&');
}
16
+
17
+ class ChemicalsService {
18
+ constructor(connection) {
19
+ this.connection = connection;
20
+ this.db = null;
21
+ this.sqsClient = null;
22
+ }
23
+
24
+ getDb() {
25
+ if (!this.db) {
26
+ this.db = getDrizzle(this.connection.pgPool);
27
+ }
28
+ return this.db;
29
+ }
30
+
31
+ async sendSqlWriteFailure({ sql, parameters, error, retryCount, failedAt }) {
32
+ try {
33
+ const region = process.env.AWS_REGION || this.connection.region;
34
+ const { awsAccountId, environment } = this.connection.config;
35
+ const defaultQueueUrl = awsAccountId
36
+ ? `https://sqs.${region}.amazonaws.com/${awsAccountId}/cr-pegasus-failed-items-${environment}`
37
+ : null;
38
+ const queueUrl = process.env.SQS_FAILED_ITEMS_QUEUE || defaultQueueUrl;
39
+
40
+ if (!queueUrl) {
41
+ logError('pegasus-sdk', 'sendSqlWriteFailure', 'No SQS queue URL available: set SQS_FAILED_ITEMS_QUEUE or provide awsAccountId in config');
42
+ return false;
43
+ }
44
+
45
+ logInfo('pegasus-sdk', `[sendSqlWriteFailure] Using queue: ${queueUrl}${process.env.SQS_FAILED_ITEMS_QUEUE ? ' (from env)' : ' (default)'}`);
46
+
47
+ if (!this.sqsClient) {
48
+ this.sqsClient = new SQSClient({ region });
49
+ }
50
+
51
+ const message = {
52
+ MessageType: 'SqlWriteFailure',
53
+ SourceService: this.connection.config.sourceService || 'pegasus-sdk',
54
+ Timestamp: (failedAt || new Date()).toISOString(),
55
+ Sql: sql,
56
+ Parameters: parameters,
57
+ OriginalError: error.message,
58
+ RetryCount: retryCount
59
+ };
60
+
61
+ const command = new SendMessageCommand({
62
+ QueueUrl: queueUrl,
63
+ MessageBody: JSON.stringify(message)
64
+ });
65
+
66
+ const response = await this.sqsClient.send(command);
67
+ logInfo('pegasus-sdk', `[sendSqlWriteFailure] SqlWriteFailure posted to SQS: MessageId=${response.MessageId}, RetryCount=${retryCount}`);
68
+ return true;
69
+ } catch (sqsError) {
70
+ logError('pegasus-sdk', 'sendSqlWriteFailure', 'Failed to post SqlWriteFailure to SQS', sqsError);
71
+ return false;
72
+ }
73
+ }
74
+
75
+ _buildChemicalUpsertSql(chemical) {
76
+ const sql = [
77
+ 'INSERT INTO chemicals (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)',
78
+ 'VALUES (@source_id, @chemical_name, @chemical_meta::jsonb, @chemical_identifiers::jsonb, @chemical_synonyms, @chemical_categories, @created_at, @updated_at)',
79
+ 'ON CONFLICT (source_id) DO UPDATE SET',
80
+ ' chemical_name = @chemical_name,',
81
+ ' chemical_meta = @chemical_meta::jsonb,',
82
+ ' chemical_identifiers = @chemical_identifiers::jsonb,',
83
+ ' chemical_synonyms = @chemical_synonyms,',
84
+ ' chemical_categories = @chemical_categories,',
85
+ ' updated_at = @updated_at'
86
+ ].join('\n');
87
+
88
+ const serializeDate = (d) => d instanceof Date ? d.toISOString() : d;
89
+
90
+ const parameters = {
91
+ '@source_id': chemical.sourceId,
92
+ '@chemical_name': chemical.chemicalName,
93
+ '@chemical_meta': JSON.stringify(chemical.chemicalMeta ?? {}),
94
+ '@chemical_identifiers': JSON.stringify(chemical.chemicalIdentifiers ?? {}),
95
+ '@chemical_synonyms': JSON.stringify(chemical.chemicalSynonyms ?? []),
96
+ '@chemical_categories': JSON.stringify(chemical.chemicalCategories ?? []),
97
+ '@created_at': serializeDate(chemical.createdAt),
98
+ '@updated_at': serializeDate(chemical.updatedAt)
99
+ };
100
+
101
+ return { sql, parameters };
102
+ }
103
+
104
+ _buildDebugSql(chemical) {
105
+ const esc = (s) => `'${String(s ?? '').replace(/'/g, "''")}'`;
106
+ const escJson = (v) => `'${JSON.stringify(v ?? {}).replace(/'/g, "''")}'`;
107
+ const escArr = (arr) => {
108
+ if (!Array.isArray(arr) || arr.length === 0) return `ARRAY[]::text[]`;
109
+ return `ARRAY[${arr.map(s => esc(s)).join(', ')}]`;
110
+ };
111
+ const escDate = (d) => esc(d instanceof Date ? d.toISOString() : (d ?? new Date().toISOString()));
112
+
113
+ return [
114
+ `INSERT INTO chemicals`,
115
+ ` (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)`,
116
+ `VALUES (`,
117
+ ` ${esc(chemical.sourceId)},`,
118
+ ` ${esc(chemical.chemicalName)},`,
119
+ ` ${escJson(chemical.chemicalMeta)}::jsonb,`,
120
+ ` ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
121
+ ` ${escArr(chemical.chemicalSynonyms)},`,
122
+ ` ${escArr(chemical.chemicalCategories)},`,
123
+ ` ${escDate(chemical.createdAt)},`,
124
+ ` ${escDate(chemical.updatedAt)}`,
125
+ `)`,
126
+ `ON CONFLICT (source_id) DO UPDATE SET`,
127
+ ` chemical_name = ${esc(chemical.chemicalName)},`,
128
+ ` chemical_meta = ${escJson(chemical.chemicalMeta)}::jsonb,`,
129
+ ` chemical_identifiers = ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
130
+ ` chemical_synonyms = ${escArr(chemical.chemicalSynonyms)},`,
131
+ ` chemical_categories = ${escArr(chemical.chemicalCategories)},`,
132
+ ` updated_at = NOW();`
133
+ ].join('\n');
134
+ }
135
+
136
+ async bulkIndexFielded(documents) {
137
+ try {
138
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Starting bulk index with ${documents?.length || 0} documents`);
139
+
140
+ if (!documents || documents.length === 0) {
141
+ logInfo('pegasus-sdk', `[bulkIndexFielded] No documents provided, returning empty result`);
142
+ return { indexed: 0, errors: [], results: [] };
143
+ }
144
+
145
+ // Proactively validate the connection before any real query fires.
146
+ // If idle too long, this reconnects first so the real query never faces
147
+ // the full connectionTimeoutMillis wait on a stale pool.
148
+ const reconnected = await this.connection.ensureConnected();
149
+ if (reconnected) {
150
+ this.db = null; // force getDb() to bind to the fresh pool
151
+ }
152
+
153
+ const db = this.getDb();
154
+ const results = [];
155
+ const errors = [];
156
+
157
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Database connection established`);
158
+
159
+ for (let i = 0; i < documents.length; i++) {
160
+ const doc = documents[i];
161
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Processing document ${i}: source_id=${doc.source_id}, chemical_name=${doc.chemical_name}`);
162
+
163
+ const parseDate = (dateValue) => {
164
+ if (!dateValue) return new Date();
165
+ if (dateValue instanceof Date) return dateValue;
166
+ if (typeof dateValue === 'string') return new Date(dateValue);
167
+ return new Date();
168
+ };
169
+
170
+ const chemical = {
171
+ sourceId: doc.source_id || doc._id,
172
+ chemicalName: doc.chemical_name || doc.name,
173
+ chemicalMeta: doc.chemical_meta || {},
174
+ chemicalIdentifiers: doc.chemical_identifiers || {},
175
+ chemicalSynonyms: doc.chemical_synonyms || [],
176
+ chemicalCategories: doc.chemical_categories || [],
177
+ createdAt: parseDate(doc.created_at),
178
+ updatedAt: parseDate(doc.updated_at),
179
+ ...(doc.imported_at && { importedAt: doc.imported_at }),
180
+ ...(doc.chemical_id && { chemicalId: doc.chemical_id })
181
+ };
182
+
183
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Prepared chemical object: sourceId=${chemical.sourceId}, chemicalName=${chemical.chemicalName}`);
184
+ logInfo('pegasus-sdk', `[bulkIndexFielded] DEBUG SQL for document ${i}:\n${this._buildDebugSql(chemical)}`);
185
+
186
+ const isConnectionError = (err) =>
187
+ err.message?.toLowerCase().includes('timeout') ||
188
+ err.message?.toLowerCase().includes('connection') ||
189
+ err.code === 'ECONNREFUSED' ||
190
+ err.code === 'ETIMEDOUT';
191
+
192
+ const attemptUpsert = () =>
193
+ db.insert(schema.chemicals)
194
+ .values(chemical)
195
+ .onConflictDoUpdate({
196
+ target: schema.chemicals.sourceId,
197
+ set: {
198
+ chemicalName: chemical.chemicalName,
199
+ chemicalMeta: chemical.chemicalMeta,
200
+ chemicalIdentifiers: chemical.chemicalIdentifiers,
201
+ chemicalSynonyms: chemical.chemicalSynonyms,
202
+ chemicalCategories: chemical.chemicalCategories,
203
+ updatedAt: new Date()
204
+ }
205
+ })
206
+ .returning({
207
+ chemicalId: schema.chemicals.chemicalId,
208
+ sourceId: schema.chemicals.sourceId
209
+ });
210
+
211
+ let lastError = null;
212
+ let retryCount = 0;
213
+ const failedAt = new Date();
214
+
215
+ try {
216
+ const [result] = await attemptUpsert();
217
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully: ${result?.chemicalId || 'no ID returned'}`);
218
+ this.connection.recordActivity();
219
+ results.push({ index: i, success: true, result });
220
+ continue;
221
+ } catch (firstErr) {
222
+ lastError = firstErr;
223
+
224
+ if (isConnectionError(firstErr)) {
225
+ // Stale pool — rebuild the connection and try once more before queuing
226
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} connection error (${firstErr.message}), reconnecting pool and retrying`);
227
+ try {
228
+ await this.connection.reconnect();
229
+ this.db = null; // force getDb() to bind to the new pool
230
+ const [result] = await attemptUpsert();
231
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully after reconnect: ${result?.chemicalId || 'no ID returned'}`);
232
+ this.connection.recordActivity();
233
+ results.push({ index: i, success: true, result });
234
+ continue;
235
+ } catch (reconnectErr) {
236
+ lastError = reconnectErr;
237
+ retryCount = 1;
238
+ }
239
+ } else {
240
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} first attempt failed (${firstErr.message}), retrying once`);
241
+ try {
242
+ const [result] = await attemptUpsert();
243
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully on retry: ${result?.chemicalId || 'no ID returned'}`);
244
+ this.connection.recordActivity();
245
+ results.push({ index: i, success: true, result });
246
+ continue;
247
+ } catch (retryErr) {
248
+ lastError = retryErr;
249
+ retryCount = 1;
250
+ }
251
+ }
252
+ }
253
+
254
+ logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} failed after ${retryCount} local retries (source_id=${chemical.sourceId})`, lastError);
255
+
256
+ const { sql: failureSql, parameters: failureParams } = this._buildChemicalUpsertSql(chemical);
257
+ const queued = await this.sendSqlWriteFailure({
258
+ sql: failureSql,
259
+ parameters: failureParams,
260
+ error: lastError,
261
+ retryCount,
262
+ failedAt
263
+ });
264
+
265
+ if (queued) {
266
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} (source_id=${chemical.sourceId}) queued for repair via SQS`);
267
+ } else {
268
+ logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} (source_id=${chemical.sourceId}) failed and could not be queued — data loss risk`, lastError);
269
+ }
270
+
271
+ results.push({ index: i, success: false, error: lastError.message, queued });
272
+ errors.push({ document: doc, error: lastError.message, queued });
273
+ }
274
+
275
+ const successCount = results.filter(r => r.success).length;
276
+ const queuedCount = results.filter(r => !r.success && r.queued).length;
277
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Bulk index complete: ${successCount}/${documents.length} succeeded, ${queuedCount} queued for repair, ${errors.length - queuedCount} unhandled errors`);
278
+
279
+ return { indexed: successCount, errors, results };
280
+ } catch (error) {
281
+ logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFielded', error);
282
+ throw error;
283
+ }
284
+ }
285
+
286
+ async bulkIndexFulltext(documents) {
287
+ try {
288
+ return { acknowledged: true, count: documents?.length || 0 };
289
+ } catch (error) {
290
+ logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFulltext', error);
291
+ throw error;
292
+ }
293
+ }
294
+
295
+ async bulkIndexSubstances(substances) {
296
+ try {
297
+ const documents = substances.map(substance => ({
298
+ source_id: substance.substance_id || substance.id,
299
+ chemical_name: substance.name || substance.substance_name,
300
+ chemical_meta: substance.meta || {},
301
+ chemical_identifiers: substance.identifiers || {},
302
+ chemical_synonyms: substance.synonyms || [],
303
+ chemical_categories: substance.categories || substance.substance_types || [],
304
+ created_at: substance.created_at,
305
+ updated_at: substance.updated_at,
306
+ imported_at: substance.imported_at
307
+ }));
308
+
309
+ return await this.bulkIndexFielded(documents);
310
+ } catch (error) {
311
+ logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexSubstances', error);
312
+ throw error;
313
+ }
314
+ }
315
+
316
+ async createChemical(chemical) {
317
+ try {
318
+ const db = this.getDb();
319
+
320
+ const [result] = await db
321
+ .insert(schema.chemicals)
322
+ .values({
323
+ sourceId: chemical.source_id,
324
+ chemicalName: chemical.chemical_name,
325
+ chemicalMeta: chemical.chemical_meta,
326
+ chemicalIdentifiers: chemical.chemical_identifiers,
327
+ chemicalSynonyms: chemical.chemical_synonyms,
328
+ chemicalCategories: chemical.chemical_categories,
329
+ createdAt: chemical.created_at || new Date(),
330
+ updatedAt: chemical.updated_at || new Date(),
331
+ ...(chemical.imported_at && { importedAt: chemical.imported_at }),
332
+ ...(chemical.chemical_id && { chemicalId: chemical.chemical_id })
333
+ })
334
+ .returning();
335
+
336
+ return result;
337
+ } catch (error) {
338
+ logError('pegasus-sdk', 'ChemicalsService', 'createChemical', error);
339
+ throw error;
340
+ }
341
+ }
342
+
343
+ async updateChemical(chemicalId, updates) {
344
+ try {
345
+ const db = this.getDb();
346
+
347
+ const updateData = {};
348
+ if (updates.chemical_name) updateData.chemicalName = updates.chemical_name;
349
+ if (updates.chemical_meta) updateData.chemicalMeta = updates.chemical_meta;
350
+ if (updates.chemical_identifiers) updateData.chemicalIdentifiers = updates.chemical_identifiers;
351
+ if (updates.chemical_synonyms) updateData.chemicalSynonyms = updates.chemical_synonyms;
352
+ if (updates.chemical_categories) updateData.chemicalCategories = updates.chemical_categories;
353
+ updateData.updatedAt = new Date();
354
+
355
+ const [result] = await db
356
+ .update(schema.chemicals)
357
+ .set(updateData)
358
+ .where(eq(schema.chemicals.chemicalId, chemicalId))
359
+ .returning();
360
+
361
+ return result || null;
362
+ } catch (error) {
363
+ logError('pegasus-sdk', 'ChemicalsService', 'updateChemical', error);
364
+ throw error;
365
+ }
366
+ }
367
+
368
+ async deleteChemical(chemicalId) {
369
+ try {
370
+ const db = this.getDb();
371
+
372
+ const [deleted] = await db
373
+ .delete(schema.chemicals)
374
+ .where(eq(schema.chemicals.chemicalId, chemicalId))
375
+ .returning();
376
+
377
+ return deleted || null;
378
+ } catch (error) {
379
+ logError('pegasus-sdk', 'ChemicalsService', 'deleteChemical', error);
380
+ throw error;
381
+ }
382
+ }
383
+
384
+ async deleteBySourceId(sourceId) {
385
+ try {
386
+ const db = this.getDb();
387
+
388
+ const [deleted] = await db
389
+ .delete(schema.chemicals)
390
+ .where(eq(schema.chemicals.sourceId, sourceId))
391
+ .returning();
392
+
393
+ return deleted || null;
394
+ } catch (error) {
395
+ logError('pegasus-sdk', 'ChemicalsService', 'deleteBySourceId', error);
396
+ throw error;
397
+ }
398
+ }
399
+
400
+ async deleteCollection(collectionName) {
401
+ try {
402
+ const db = this.getDb();
403
+
404
+ const deleted = await db
405
+ .delete(schema.chemicals)
406
+ .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]))
407
+ .returning();
408
+
409
+ return { deletedCount: deleted.length, deleted };
410
+ } catch (error) {
411
+ logError('pegasus-sdk', 'ChemicalsService', 'deleteCollection', error);
412
+ throw error;
413
+ }
414
+ }
415
+
416
+ async updateCollectionProperty(collectionName, propertyPath, newValue) {
417
+ try {
418
+ const db = this.getDb();
419
+ const pathArray = propertyPath.split('.');
420
+ const valueJson = JSON.stringify(newValue);
421
+
422
+ const results = await db
423
+ .update(schema.chemicals)
424
+ .set({
425
+ chemicalMeta: sql`jsonb_set(${schema.chemicals.chemicalMeta}, ${pathArray}::text[], ${valueJson}::jsonb)`,
426
+ updatedAt: new Date()
427
+ })
428
+ .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]))
429
+ .returning();
430
+
431
+ return { updatedCount: results.length, updated: results };
432
+ } catch (error) {
433
+ logError('pegasus-sdk', 'ChemicalsService', 'updateCollectionProperty', error);
434
+ throw error;
435
+ }
436
+ }
437
+
438
+ async bulkUpdateProperty(filter, propertyPath, newValue) {
439
+ try {
440
+ const db = this.getDb();
441
+
442
+ let whereCondition = sql`1=1`;
443
+
444
+ if (filter.chemicalIds && filter.chemicalIds.length > 0) {
445
+ whereCondition = inArray(schema.chemicals.chemicalId, filter.chemicalIds);
446
+ } else if (filter.sourceIds && filter.sourceIds.length > 0) {
447
+ whereCondition = inArray(schema.chemicals.sourceId, filter.sourceIds);
448
+ } else if (filter.category) {
449
+ whereCondition = arrayContains(schema.chemicals.chemicalCategories, [filter.category]);
450
+ }
451
+
452
+ const pathArray = propertyPath.split('.');
453
+ const valueJson = JSON.stringify(newValue);
454
+
455
+ const results = await db
456
+ .update(schema.chemicals)
457
+ .set({
458
+ chemicalMeta: sql`jsonb_set(COALESCE(${schema.chemicals.chemicalMeta}, '{}'), ${pathArray}::text[], ${valueJson}::jsonb)`,
459
+ updatedAt: new Date()
460
+ })
461
+ .where(whereCondition)
462
+ .returning();
463
+
464
+ return { updatedCount: results.length, updated: results };
465
+ } catch (error) {
466
+ logError('pegasus-sdk', 'ChemicalsService', 'bulkUpdateProperty', error);
467
+ throw error;
468
+ }
469
+ }
470
+
471
+ async getChemicalById(chemicalId) {
472
+ try {
473
+ const db = this.getDb();
474
+
475
+ const [result] = await db
476
+ .select()
477
+ .from(schema.chemicals)
478
+ .where(eq(schema.chemicals.chemicalId, chemicalId))
479
+ .limit(1);
480
+
481
+ return result || null;
482
+ } catch (error) {
483
+ logError('pegasus-sdk', 'ChemicalsService', 'getChemicalById', error);
484
+ throw error;
485
+ }
486
+ }
487
+
488
+ async getChemicalBySourceId(sourceId) {
489
+ try {
490
+ const db = this.getDb();
491
+
492
+ const [result] = await db
493
+ .select()
494
+ .from(schema.chemicals)
495
+ .where(eq(schema.chemicals.sourceId, sourceId))
496
+ .limit(1);
497
+
498
+ return result || null;
499
+ } catch (error) {
500
+ logError('pegasus-sdk', 'ChemicalsService', 'getChemicalBySourceId', error);
501
+ throw error;
502
+ }
503
+ }
504
+
505
+ async getChemicalsByCAS(casNumber) {
506
+ try {
507
+ const db = this.getDb();
508
+
509
+ const results = await db
510
+ .select()
511
+ .from(schema.chemicals)
512
+ .where(sql`${schema.chemicals.chemicalIdentifiers}->>'CAS' = ${casNumber} OR ${schema.chemicals.chemicalIdentifiers}->'CAS' ? ${casNumber}`);
513
+
514
+ return results;
515
+ } catch (error) {
516
+ logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByCAS', error);
517
+ throw error;
518
+ }
519
+ }
520
+
521
+ async getChemicalsByIdentifier(identifierType, identifierValue) {
522
+ try {
523
+ if (!ALLOWED_IDENTIFIER_TYPES.has(identifierType)) {
524
+ throw new Error(`Invalid identifier type: ${identifierType}`);
525
+ }
526
+
527
+ const db = this.getDb();
528
+
529
+ const results = await db
530
+ .select()
531
+ .from(schema.chemicals)
532
+ .where(sql`${schema.chemicals.chemicalIdentifiers}->>${identifierType} = ${identifierValue} OR ${schema.chemicals.chemicalIdentifiers}->${identifierType} ? ${identifierValue}`);
533
+
534
+ return results;
535
+ } catch (error) {
536
+ logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByIdentifier', error);
537
+ throw error;
538
+ }
539
+ }
540
+
541
+ async countByCollection(collectionName) {
542
+ try {
543
+ const db = this.getDb();
544
+
545
+ const result = await db
546
+ .select({ count: sql`count(*)::int` })
547
+ .from(schema.chemicals)
548
+ .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]));
549
+
550
+ return { count: result[0].count };
551
+ } catch (error) {
552
+ logError('pegasus-sdk', 'ChemicalsService', 'countByCollection', error);
553
+ throw error;
554
+ }
555
+ }
556
+
557
+ async countByIdentifier(identifierValue) {
558
+ try {
559
+ const db = this.getDb();
560
+
561
+ const searchPattern = `%${escapeLikePattern(identifierValue)}%`;
562
+ const result = await db
563
+ .select({ count: sql`count(*)::int` })
564
+ .from(schema.chemicals)
565
+ .where(sql`${schema.chemicals.chemicalIdentifiers}::text LIKE ${searchPattern}`);
566
+
567
+ return { count: result[0].count };
568
+ } catch (error) {
569
+ logError('pegasus-sdk', 'ChemicalsService', 'countByIdentifier', error);
570
+ throw error;
571
+ }
572
+ }
573
+
574
+ async countByCAS(casNumber) {
575
+ try {
576
+ const db = this.getDb();
577
+
578
+ const result = await db
579
+ .select({ count: sql`count(*)::int` })
580
+ .from(schema.chemicals)
581
+ .where(sql`${schema.chemicals.chemicalIdentifiers}->>'CAS' = ${casNumber} OR ${schema.chemicals.chemicalIdentifiers}->'CAS' ? ${casNumber}`);
582
+
583
+ return { count: result[0].count };
584
+ } catch (error) {
585
+ logError('pegasus-sdk', 'ChemicalsService', 'countByCAS', error);
586
+ throw error;
587
+ }
588
+ }
589
+
590
+ async getTotalSynonymCount() {
591
+ try {
592
+ const db = this.getDb();
593
+
594
+ const result = await db
595
+ .select({ count: sql`sum(array_length(${schema.chemicals.chemicalSynonyms}, 1))::int` })
596
+ .from(schema.chemicals);
597
+
598
+ return { count: result[0].count || 0 };
599
+ } catch (error) {
600
+ logError('pegasus-sdk', 'ChemicalsService', 'getTotalSynonymCount', error);
601
+ throw error;
602
+ }
603
+ }
604
+
605
+ async getSynonymCount(synonymTerm) {
606
+ try {
607
+ const db = this.getDb();
608
+
609
+ const result = await db
610
+ .select({ count: sql`count(*)::int` })
611
+ .from(schema.chemicals)
612
+ .where(arrayContains(schema.chemicals.chemicalSynonyms, [synonymTerm]));
613
+
614
+ return { count: result[0].count };
615
+ } catch (error) {
616
+ logError('pegasus-sdk', 'ChemicalsService', 'getSynonymCount', error);
617
+ throw error;
618
+ }
619
+ }
620
+
621
+ async convertIdentifier(fromIdentifier, toIdentifierType) {
622
+ try {
623
+ const db = this.getDb();
624
+
625
+ const searchPattern = `%${escapeLikePattern(fromIdentifier)}%`;
626
+ const chemicals = await db
627
+ .select()
628
+ .from(schema.chemicals)
629
+ .where(sql`${schema.chemicals.chemicalIdentifiers}::text LIKE ${searchPattern}`);
630
+
631
+ if (chemicals.length === 0) {
632
+ return null;
633
+ }
634
+
635
+ const chemical = chemicals[0];
636
+ const identifiers = chemical.chemicalIdentifiers || {};
637
+ const toIdentifier = identifiers[toIdentifierType];
638
+
639
+ return {
640
+ fromIdentifier,
641
+ toIdentifierType,
642
+ toIdentifier,
643
+ chemicalId: chemical.chemicalId,
644
+ chemicalName: chemical.chemicalName
645
+ };
646
+ } catch (error) {
647
+ logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifier', error);
648
+ throw error;
649
+ }
650
+ }
651
+
652
+ async convertIdentifiersBatch(fromIdentifiers, toIdentifierType) {
653
+ try {
654
+ const conversions = await Promise.all(
655
+ fromIdentifiers.map(fromIdentifier =>
656
+ this.convertIdentifier(fromIdentifier, toIdentifierType)
657
+ )
658
+ );
659
+
660
+ return conversions.filter(conversion => conversion !== null);
661
+ } catch (error) {
662
+ logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifiersBatch', error);
663
+ throw error;
664
+ }
665
+ }
666
+
667
+ /**
668
+ * Search for chemicals by name using OpenSearch
669
+ * @param {string} searchTerm - Name to search for
670
+ * @param {number} limit - Maximum number of results (default: 10)
671
+ * @returns {Promise<Object>} Search results
672
+ */
673
+ async searchByName(searchTerm, limit = 10) {
674
+ if (!searchTerm) {
675
+ return { results: [] };
676
+ }
677
+
678
+ try {
679
+ const opensearchClient = this.connection.getOpenSearchClient();
680
+ const indexName = this.connection.getOpenSearchIndex();
681
+
682
+ const response = await opensearchClient.search({
683
+ index: indexName,
684
+ body: {
685
+ size: limit,
686
+ query: {
687
+ bool: {
688
+ should: [
689
+ { term: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
690
+ { prefix: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
691
+ { term: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
692
+ { prefix: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } }
693
+ ],
694
+ minimum_should_match: 1
695
+ }
696
+ },
697
+ _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
698
+ }
699
+ });
700
+
701
+ const hits = response.body?.hits?.hits || [];
702
+ const results = hits.map((hit) => ({
703
+ id: hit._source.postgres_id,
704
+ name: hit._source.chemical_name,
705
+ cas: hit._source.cas_numbers || [],
706
+ identifiers: hit._source.identifier_values || [],
707
+ synonyms: hit._source.synonyms || [],
708
+ score: hit._score
709
+ }));
710
+
711
+ return { results };
712
+ } catch (error) {
713
+ logError('pegasus-sdk', 'ChemicalsService', 'searchByName', error);
714
+ throw error;
715
+ }
716
+ }
717
+
718
+ /**
719
+ * Search for chemicals by synonym using OpenSearch
720
+ * @param {string} synonymTerm - Synonym to search for
721
+ * @param {number} limit - Maximum number of results (default: 10)
722
+ * @returns {Promise<Object>} Search results
723
+ */
724
+ async searchBySynonym(synonymTerm, limit = 10) {
725
+ if (!synonymTerm) {
726
+ return { results: [] };
727
+ }
728
+
729
+ try {
730
+ const opensearchClient = this.connection.getOpenSearchClient();
731
+ const indexName = this.connection.getOpenSearchIndex();
732
+
733
+ const response = await opensearchClient.search({
734
+ index: indexName,
735
+ body: {
736
+ size: limit,
737
+ query: {
738
+ bool: {
739
+ should: [
740
+ { term: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
741
+ { prefix: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
742
+ { term: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
743
+ { prefix: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } }
744
+ ],
745
+ minimum_should_match: 1
746
+ }
747
+ },
748
+ _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
749
+ }
750
+ });
751
+
752
+ const hits = response.body?.hits?.hits || [];
753
+ const results = hits.map((hit) => ({
754
+ id: hit._source.postgres_id,
755
+ name: hit._source.chemical_name,
756
+ cas: hit._source.cas_numbers || [],
757
+ identifiers: hit._source.identifier_values || [],
758
+ synonyms: hit._source.synonyms || [],
759
+ score: hit._score
760
+ }));
761
+
762
+ return { results };
763
+ } catch (error) {
764
+ logError('pegasus-sdk', 'ChemicalsService', 'searchBySynonym', error);
765
+ throw error;
766
+ }
767
+ }
768
+
769
+ async countAll() {
770
+ try {
771
+ const db = this.getDb();
772
+ const result = await db
773
+ .select({ count: sql`count(*)::int` })
774
+ .from(schema.chemicals);
775
+ return { count: result[0].count };
776
+ } catch (error) {
777
+ logError('pegasus-sdk', 'ChemicalsService', 'countAll', error);
778
+ throw error;
779
+ }
780
+ }
781
+
782
+ async findChemicalsWithoutDocuments(collectionName, searchTerm, pageSize = 100) {
783
+ try {
784
+ const db = this.getDb();
785
+
786
+ let whereConditions = [];
787
+
788
+ if (collectionName) {
789
+ whereConditions.push(arrayContains(schema.chemicals.chemicalCategories, [collectionName]));
790
+ }
791
+
792
+ if (searchTerm) {
793
+ const searchPattern = `%${escapeLikePattern(searchTerm)}%`;
794
+ whereConditions.push(sql`${schema.chemicals.chemicalName} ILIKE ${searchPattern}`);
795
+ }
796
+
797
+ const whereClause = whereConditions.length > 0 ? and(...whereConditions) : undefined;
798
+
799
+ const results = await db
800
+ .select()
801
+ .from(schema.chemicals)
802
+ .where(whereClause)
803
+ .limit(pageSize);
804
+
805
+ return results;
806
+ } catch (error) {
807
+ logError('pegasus-sdk', 'ChemicalsService', 'findChemicalsWithoutDocuments', error);
808
+ throw error;
809
+ }
810
+ }
811
+
812
+ async countChemicalsWithoutDocuments(collectionName) {
813
+ try {
814
+ const db = this.getDb();
815
+
816
+ const whereClause = collectionName
817
+ ? arrayContains(schema.chemicals.chemicalCategories, [collectionName])
818
+ : undefined;
819
+
820
+ const result = await db
821
+ .select({ count: sql`count(*)::int` })
822
+ .from(schema.chemicals)
823
+ .where(whereClause);
824
+
825
+ return { count: result[0].count };
826
+ } catch (error) {
827
+ logError('pegasus-sdk', 'ChemicalsService', 'countChemicalsWithoutDocuments', error);
828
+ throw error;
829
+ }
830
+ }
831
+
832
+ _buildEsHandlers() {
833
+ return {
834
+ index: async (params) => {
835
+ const chemical = params.body;
836
+ const result = await this.createChemical(chemical);
837
+
838
+ return {
839
+ _index: params.index,
840
+ _id: result.chemicalId,
841
+ _version: 1,
842
+ result: 'created',
843
+ _source: result
844
+ };
845
+ },
846
+
847
+ bulk: async (params) => {
848
+ const operations = params.body || params.operations;
849
+
850
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Starting bulk operation with ${operations?.length || 0} total operations`);
851
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Params index: ${params.index}`);
852
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Operations array type: ${Array.isArray(operations) ? 'array' : typeof operations}`);
853
+
854
+ const cdiDocuments = [];
855
+ let cdiOpCount = 0;
856
+ let otherOpCount = 0;
857
+
858
+ for (let i = 0; i < operations.length; i++) {
859
+ const op = operations[i];
860
+ const isIndexOp = !!(op.index || op.create);
861
+ const indexName = op.index?._index || op.create?._index || op.delete?._index || op.update?._index;
862
+
863
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Op[${i}]: action=${Object.keys(op)[0] || 'unknown'}, index=${indexName}`);
864
+
865
+ if ((op.index || op.create) &&
866
+ (op.index?._index === 'chemical_data_index' || op.create?._index === 'chemical_data_index')) {
867
+ const doc = operations[i + 1];
868
+ const sourceId = op.index?._id || op.create?._id;
869
+
870
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Found CDI entry: sourceId=${sourceId}, hasDoc=${!!doc}`);
871
+
872
+ if (doc && sourceId) {
873
+ const cdiDoc = {
874
+ source_id: sourceId,
875
+ chemical_name: doc.chemical_primary_name || (doc.chemical_names && doc.chemical_names[0]) || null,
876
+ chemical_meta: doc.chemical_meta || {},
877
+ chemical_identifiers: doc.chemical_identifiers || {},
878
+ chemical_synonyms: doc.chemical_synonyms || [],
879
+ chemical_categories: doc.chemical_categories || [],
880
+ created_at: doc.chemical_created_at ? (typeof doc.chemical_created_at === 'string' ? new Date(doc.chemical_created_at) : doc.chemical_created_at) : new Date(),
881
+ updated_at: doc.chemical_updated_at ? (typeof doc.chemical_updated_at === 'string' ? new Date(doc.chemical_updated_at) : doc.chemical_updated_at) : new Date()
882
+ };
883
+ cdiDocuments.push(cdiDoc);
884
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Extracted CDI doc: ${JSON.stringify({ source_id: cdiDoc.source_id, chemical_name: cdiDoc.chemical_name })}`);
885
+ i++;
886
+ cdiOpCount++;
887
+ } else {
888
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] CDI entry incomplete: sourceId=${sourceId}, doc=${!!doc}`);
889
+ }
890
+ } else {
891
+ otherOpCount++;
892
+ }
893
+ }
894
+
895
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Scan complete: ${cdiOpCount} CDI docs found, ${otherOpCount} other operations skipped`);
896
+
897
+ if (cdiDocuments.length === 0) {
898
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] No CDI documents to index, returning empty no-op response`);
899
+ return { took: 0, errors: false, items: [] };
900
+ }
901
+
902
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Calling bulkIndexFielded with ${cdiDocuments.length} CDI documents`);
903
+
904
+ try {
905
+ const result = await this.bulkIndexFielded(cdiDocuments);
906
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] bulkIndexFielded returned: indexed=${result.indexed}, errors=${result.errors.length}`);
907
+
908
+ if (result.errors.length > 0) {
909
+ logError('pegasus-sdk', 'ChemicalsService.bulk', 'Errors during bulk indexing', result.errors);
910
+ }
911
+
912
+ return {
913
+ took: 1,
914
+ errors: result.errors.length > 0,
915
+ items: result.results.map((res, idx) => ({
916
+ index: {
917
+ _index: 'chemical_data_index',
918
+ _id: cdiDocuments[idx].source_id,
919
+ status: res.success ? 200 : 400,
920
+ result: res.success ? 'created' : 'error',
921
+ ...(res.success ? {} : { error: { type: 'mapper_parsing_exception', reason: res.error } })
922
+ }
923
+ }))
924
+ };
925
+ } catch (error) {
926
+ logError('pegasus-sdk', 'ChemicalsService.bulk', 'Fatal error during bulk indexing', error);
927
+ throw error;
928
+ }
929
+ },
930
+
931
+ get: async (params) => {
932
+ const result = await this.getChemicalBySourceId(params.id);
933
+
934
+ if (!result) {
935
+ return {
936
+ _index: params.index,
937
+ _id: params.id,
938
+ found: false
939
+ };
940
+ }
941
+
942
+ return {
943
+ _index: params.index,
944
+ _id: params.id,
945
+ _version: 1,
946
+ found: true,
947
+ _source: result
948
+ };
949
+ },
950
+
951
+ update: async (params) => {
952
+ const result = await this.updateChemical(params.id, params.body);
953
+
954
+ return {
955
+ _index: params.index,
956
+ _id: params.id,
957
+ _version: 2,
958
+ result: result ? 'updated' : 'noop',
959
+ _source: result
960
+ };
961
+ },
962
+
963
+ delete: async (params) => {
964
+ if (params.index === 'synonym_lookup_index') {
965
+ return { _index: params.index, _id: params.id, result: 'not_found' };
966
+ }
967
+ const result = await this.deleteBySourceId(params.id);
968
+
969
+ return {
970
+ _index: params.index,
971
+ _id: params.id,
972
+ result: result ? 'deleted' : 'not_found'
973
+ };
974
+ },
975
+
976
+ deleteByQuery: async (params) => {
977
+ const sourceId = params.body?.query?.term?.chemical_set_identifier
978
+ || params.body?.query?.term?.source_id;
979
+ if (!sourceId) {
980
+ return { deleted: 0, failures: [] };
981
+ }
982
+ const result = await this.deleteBySourceId(sourceId);
983
+ return {
984
+ deleted: result ? 1 : 0,
985
+ failures: []
986
+ };
987
+ },
988
+
989
+ search: async (params) => {
990
+ let searchTerm = '';
991
+ let limit = params.body?.size || 10;
992
+
993
+ if (params.index === 'synonym_lookup_index') {
994
+ const query = params.body?.query;
995
+ searchTerm = query?.match?.chemical_name ||
996
+ query?.term?.chemical_name ||
997
+ query?.query_string?.query || '';
998
+ const searchResults = await this.searchBySynonym(searchTerm, limit);
999
+
1000
+ return {
1001
+ took: 1,
1002
+ timed_out: false,
1003
+ _shards: {
1004
+ total: 1,
1005
+ successful: 1,
1006
+ skipped: 0,
1007
+ failed: 0
1008
+ },
1009
+ hits: {
1010
+ total: {
1011
+ value: searchResults.results.length,
1012
+ relation: 'eq'
1013
+ },
1014
+ max_score: searchResults.results[0]?.score || 0,
1015
+ hits: searchResults.results.map(result => ({
1016
+ _index: params.index,
1017
+ _id: result.id,
1018
+ _score: result.score,
1019
+ _source: {
1020
+ postgres_id: result.id,
1021
+ chemical_name: result.name,
1022
+ cas_numbers: result.cas,
1023
+ identifier_values: result.identifiers,
1024
+ synonyms: result.synonyms
1025
+ }
1026
+ }))
1027
+ }
1028
+ };
1029
+ } else {
1030
+ const query = params.body?.query;
1031
+ searchTerm = query?.match?.chemical_name ||
1032
+ query?.term?.chemical_name ||
1033
+ query?.query_string?.query || '';
1034
+ const searchResults = await this.searchByName(searchTerm, limit);
1035
+
1036
+ return {
1037
+ took: 1,
1038
+ timed_out: false,
1039
+ _shards: {
1040
+ total: 1,
1041
+ successful: 1,
1042
+ skipped: 0,
1043
+ failed: 0
1044
+ },
1045
+ hits: {
1046
+ total: {
1047
+ value: searchResults.results.length,
1048
+ relation: 'eq'
1049
+ },
1050
+ max_score: searchResults.results[0]?.score || 0,
1051
+ hits: searchResults.results.map(result => ({
1052
+ _index: params.index,
1053
+ _id: result.id,
1054
+ _score: result.score,
1055
+ _source: {
1056
+ postgres_id: result.id,
1057
+ chemical_name: result.name,
1058
+ cas_numbers: result.cas,
1059
+ identifier_values: result.identifiers,
1060
+ synonyms: result.synonyms
1061
+ }
1062
+ }))
1063
+ }
1064
+ };
1065
+ }
1066
+ },
1067
+
1068
+ count: async (params) => {
1069
+ if (params.index === 'synonym_lookup_index') {
1070
+ return await this.getTotalSynonymCount();
1071
+ }
1072
+ return await this.countAll();
1073
+ }
1074
+ };
1075
+ }
1076
+
1077
+ registerElasticsearchHandlers(elasticsearchService) {
1078
+ const configurablePatterns = this.connection.config.indexRoutes?.chemicals || ['chemicals*'];
1079
+ const legacyPatterns = ['synonym_lookup_index', 'chemical_data_index', 'chemical_converter_index'];
1080
+ const allPatterns = [...new Set([...configurablePatterns, ...legacyPatterns])];
1081
+ const handlers = this._buildEsHandlers();
1082
+ allPatterns.forEach(pattern => {
1083
+ elasticsearchService.registerIndexRoute(pattern, handlers);
1084
+ });
1085
+ }
1086
+ }
1087
+
1062
1088
  module.exports = ChemicalsService;