@yarkivaev/source-to-sink 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/postgresSink.js +54 -7
- package/test/test_postgresSink.js +36 -1
package/package.json
CHANGED
package/src/postgresSink.js
CHANGED
|
@@ -5,7 +5,8 @@ import pg from 'pg';
|
|
|
5
5
|
*
|
|
6
6
|
* Creates a pg.Pool internally and implements the Sink
|
|
7
7
|
* interface for use with batch collectors. Supports optional
|
|
8
|
-
* conflict resolution via ON CONFLICT DO NOTHING
|
|
8
|
+
* conflict resolution via ON CONFLICT DO NOTHING or
|
|
9
|
+
* ON CONFLICT DO UPDATE SET.
|
|
9
10
|
*
|
|
10
11
|
* @example
|
|
11
12
|
* const sink = postgresSink('postgresql://localhost:5432/db', 'metrics', ['ts', 'value']);
|
|
@@ -20,9 +21,55 @@ import pg from 'pg';
|
|
|
20
21
|
* @param {string} table - Target table name
|
|
21
22
|
* @param {Array<string>} columns - Column names for insertion
|
|
22
23
|
* @param {object} [options] - Optional configuration
|
|
23
|
-
* @param {Array<string>} [options.conflict] - Columns for ON CONFLICT
|
|
24
|
+
* @param {Array<string>} [options.conflict] - Columns for ON CONFLICT clause
|
|
25
|
+
* @param {Array<string>} [options.update] - Columns for DO UPDATE SET clause
|
|
24
26
|
* @returns {object} Sink with write(records) method
|
|
25
27
|
*/
|
|
28
|
+
/**
 * Removes duplicate records by conflict key within a batch.
 *
 * PostgreSQL rejects INSERT batches where ON CONFLICT would update
 * the same row twice. This keeps the last occurrence for each
 * unique combination of conflict column values; the surviving record
 * sits at the position where that combination first appeared.
 *
 * The lookup key is the JSON encoding of the normalised conflict
 * values, not a delimiter-joined string, so that:
 *   - values containing the delimiter cannot collide across columns,
 *   - null/undefined stay distinct from the empty string,
 *   - Date values are compared with millisecond precision.
 * Numbers and their string forms still compare equal (1 and '1').
 *
 * @example
 * deduplicate(
 *   [{ machine: 'a', start: 1, name: 'pending' },
 *    { machine: 'a', start: 1, name: 'completed' }],
 *   ['machine', 'start']
 * );
 * // => [{ machine: 'a', start: 1, name: 'completed' }]
 *
 * @param {Array} records - Array of record objects
 * @param {Array<string>} keys - Conflict column names
 * @returns {Array} Deduplicated records; the input array itself when
 *   keys is empty
 */
export function deduplicate(records, keys) {
  if (keys.length === 0) return records;
  const seen = new Map();
  for (const record of records) {
    const key = JSON.stringify(keys.map((k) => conflictKeyPart(record[k])));
    // Map.set on an existing key keeps its insertion slot and swaps the
    // value, which yields "last occurrence wins, first position kept".
    seen.set(key, record);
  }
  return Array.from(seen.values());
}

/**
 * Normalises one conflict column value into a comparable key part.
 *
 * @param {*} value - Raw value read from a record
 * @returns {string|null} null for null/undefined, an ISO-8601 timestamp
 *   for valid Date objects, String(value) for everything else
 */
function conflictKeyPart(value) {
  // null and undefined are treated as the same "missing" value.
  if (value == null) return null;
  if (value instanceof Date) {
    // String(date) only has one-second resolution; toISOString keeps the
    // milliseconds. It throws on an invalid Date, so fall back for those.
    return Number.isNaN(value.getTime()) ? String(value) : value.toISOString();
  }
  return String(value);
}
|
|
56
|
+
|
|
57
|
+
/**
 * Produces the trailing ON CONFLICT clause for the INSERT statement.
 *
 * Returns an empty string unless options.conflict is a non-empty array.
 * With conflict columns only, the clause is DO NOTHING; when
 * options.update is also a non-empty array, each listed column is
 * assigned from EXCLUDED in a DO UPDATE SET clause.
 *
 * Column names are interpolated into the SQL text as given.
 *
 * @param {object} options - Sink options with conflict and update arrays
 * @returns {string} SQL suffix (with a leading space) or empty string
 */
function buildSuffix(options) {
  const { conflict, update } = options;

  const hasConflict = Array.isArray(conflict) && conflict.length > 0;
  if (!hasConflict) {
    return '';
  }
  const target = conflict.join(', ');

  const hasUpdate = Array.isArray(update) && update.length > 0;
  if (!hasUpdate) {
    return ` ON CONFLICT (${target}) DO NOTHING`;
  }

  const assignments = update
    .map((column) => `${column} = EXCLUDED.${column}`)
    .join(', ');
  return ` ON CONFLICT (${target}) DO UPDATE SET ${assignments}`;
}
|
|
72
|
+
|
|
26
73
|
export default function postgresSink(url, table, columns, options = {}) {
|
|
27
74
|
if (typeof url !== 'string' || url.length === 0) {
|
|
28
75
|
throw new Error('URL must be a non-empty string');
|
|
@@ -34,9 +81,8 @@ export default function postgresSink(url, table, columns, options = {}) {
|
|
|
34
81
|
throw new Error('Columns must be a non-empty array');
|
|
35
82
|
}
|
|
36
83
|
const pool = new pg.Pool({ connectionString: url });
|
|
37
|
-
const suffix =
|
|
38
|
-
|
|
39
|
-
: '';
|
|
84
|
+
const suffix = buildSuffix(options);
|
|
85
|
+
const conflict = Array.isArray(options.conflict) ? options.conflict : [];
|
|
40
86
|
return {
|
|
41
87
|
/**
|
|
42
88
|
* Writes records to PostgreSQL table.
|
|
@@ -45,13 +91,14 @@ export default function postgresSink(url, table, columns, options = {}) {
|
|
|
45
91
|
* @returns {Promise} Promise resolving when insert completes
|
|
46
92
|
*/
|
|
47
93
|
write(records) {
|
|
48
|
-
const
|
|
94
|
+
const unique = deduplicate(records, conflict);
|
|
95
|
+
const placeholders = unique.map((_, i) => {
|
|
49
96
|
const offset = i * columns.length;
|
|
50
97
|
const row = columns.map((__, j) => `$${offset + j + 1}`).join(', ');
|
|
51
98
|
return `(${row})`;
|
|
52
99
|
}).join(', ');
|
|
53
100
|
const query = `INSERT INTO ${table} (${columns.join(', ')}) VALUES ${placeholders}${suffix}`;
|
|
54
|
-
const values =
|
|
101
|
+
const values = unique.flatMap((record) => columns.map((col) => record[col]));
|
|
55
102
|
return pool.query(query, values);
|
|
56
103
|
}
|
|
57
104
|
};
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import assert from 'node:assert';
|
|
2
2
|
import { describe, it } from 'mocha';
|
|
3
|
-
import postgresSink from '../src/postgresSink.js';
|
|
3
|
+
import postgresSink, { deduplicate } from '../src/postgresSink.js';
|
|
4
4
|
|
|
5
5
|
describe('postgresSink', () => {
|
|
6
6
|
it('throws on missing url', () => {
|
|
@@ -73,4 +73,39 @@ describe('postgresSink', () => {
|
|
|
73
73
|
['ts', 'value'], {});
|
|
74
74
|
assert.strictEqual(typeof sink.write, 'function', 'Should have write method without conflict key');
|
|
75
75
|
});
|
|
76
|
+
|
|
77
|
+
it('returns sink with write method when conflict and update options are provided', () => {
|
|
78
|
+
const sink = postgresSink('postgresql://localhost:5432/db', 'segments',
|
|
79
|
+
['machine', 'name', 'start_time', 'end_time', 'duration'],
|
|
80
|
+
{ conflict: ['machine', 'start_time'], update: ['name', 'end_time', 'duration'] });
|
|
81
|
+
assert.strictEqual(typeof sink.write, 'function', 'Should have write method with update option');
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it('deduplicates batch by conflict columns keeping last occurrence', () => {
|
|
85
|
+
const records = [
|
|
86
|
+
{ machine: 'ü-1', name: 'pending', start_time: '2024-01-01' },
|
|
87
|
+
{ machine: 'ü-1', name: 'completed', start_time: '2024-01-01' },
|
|
88
|
+
{ machine: 'ö-2', name: 'pending', start_time: '2024-01-02' }
|
|
89
|
+
];
|
|
90
|
+
const result = deduplicate(records, ['machine', 'start_time']);
|
|
91
|
+
assert.strictEqual(result.length, 2, 'Should remove duplicate conflict key');
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
it('keeps last record when conflict columns match', () => {
|
|
95
|
+
const records = [
|
|
96
|
+
{ machine: 'ü-1', name: 'pending', start_time: '2024-01-01' },
|
|
97
|
+
{ machine: 'ü-1', name: 'completed', start_time: '2024-01-01' }
|
|
98
|
+
];
|
|
99
|
+
const result = deduplicate(records, ['machine', 'start_time']);
|
|
100
|
+
assert.strictEqual(result[0].name, 'completed', 'Should keep the last occurrence');
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
it('returns records unchanged when no conflict columns', () => {
|
|
104
|
+
const records = [
|
|
105
|
+
{ machine: 'ä-1', name: 'à', start_time: '2024-01-01' },
|
|
106
|
+
{ machine: 'ä-1', name: 'à', start_time: '2024-01-01' }
|
|
107
|
+
];
|
|
108
|
+
const result = deduplicate(records, []);
|
|
109
|
+
assert.strictEqual(result.length, 2, 'Should not deduplicate without conflict columns');
|
|
110
|
+
});
|
|
76
111
|
});
|