@yarkivaev/source-to-sink 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/postgresSink.js +33 -2
- package/test/test_postgresSink.js +29 -1
package/package.json
CHANGED
package/src/postgresSink.js
CHANGED
|
@@ -25,6 +25,35 @@ import pg from 'pg';
|
|
|
25
25
|
* @param {Array<string>} [options.update] - Columns for DO UPDATE SET clause
|
|
26
26
|
* @returns {object} Sink with write(records) method
|
|
27
27
|
*/
|
|
28
|
+
/**
 * Removes duplicate records by conflict key within a batch.
 *
 * PostgreSQL rejects INSERT batches where ON CONFLICT would update
 * the same row twice. This keeps the last occurrence for each
 * unique combination of conflict column values; the surviving record
 * sits at the position where that combination first appeared.
 *
 * Key values are compared by their text form, so 1 and '1' count as the
 * same key. null and undefined are treated as one "missing" value that is
 * distinct from the empty string, and valid Date values are compared at
 * millisecond precision.
 *
 * @example
 * deduplicate(
 *   [{ machine: 'a', start: 1, name: 'pending' },
 *    { machine: 'a', start: 1, name: 'completed' }],
 *   ['machine', 'start']
 * );
 * // => [{ machine: 'a', start: 1, name: 'completed' }]
 *
 * @param {Array<object>} records - Array of record objects
 * @param {Array<string>} keys - Conflict column names
 * @returns {Array<object>} Deduplicated records; the input array itself
 *   (not a copy) when keys is empty
 */
export function deduplicate(records, keys) {
  if (keys.length === 0) {
    return records;
  }
  // Normalizes one conflict-column value into a JSON-encodable key part.
  const toKeyPart = (value) => {
    if (value === null || value === undefined) {
      // Kept as JSON null so a missing value never collides with ''.
      return null;
    }
    if (value instanceof Date && !Number.isNaN(value.getTime())) {
      // String(date) drops milliseconds; the ISO form keeps them.
      return value.toISOString();
    }
    return String(value);
  };
  const seen = new Map();
  for (const record of records) {
    // JSON encoding escapes every part, so no value can spill across a
    // column boundary the way a plain separator-joined string can.
    const key = JSON.stringify(keys.map((k) => toKeyPart(record[k])));
    seen.set(key, record);
  }
  return Array.from(seen.values());
}
|
|
56
|
+
|
|
28
57
|
/**
|
|
29
58
|
* Builds the ON CONFLICT SQL suffix from options.
|
|
30
59
|
*
|
|
@@ -53,6 +82,7 @@ export default function postgresSink(url, table, columns, options = {}) {
|
|
|
53
82
|
}
|
|
54
83
|
const pool = new pg.Pool({ connectionString: url });
|
|
55
84
|
const suffix = buildSuffix(options);
|
|
85
|
+
const conflict = Array.isArray(options.conflict) ? options.conflict : [];
|
|
56
86
|
return {
|
|
57
87
|
/**
|
|
58
88
|
* Writes records to PostgreSQL table.
|
|
@@ -61,13 +91,14 @@ export default function postgresSink(url, table, columns, options = {}) {
|
|
|
61
91
|
* @returns {Promise} Promise resolving when insert completes
|
|
62
92
|
*/
|
|
63
93
|
write(records) {
|
|
64
|
-
const
|
|
94
|
+
const unique = deduplicate(records, conflict);
|
|
95
|
+
const placeholders = unique.map((_, i) => {
|
|
65
96
|
const offset = i * columns.length;
|
|
66
97
|
const row = columns.map((__, j) => `$${offset + j + 1}`).join(', ');
|
|
67
98
|
return `(${row})`;
|
|
68
99
|
}).join(', ');
|
|
69
100
|
const query = `INSERT INTO ${table} (${columns.join(', ')}) VALUES ${placeholders}${suffix}`;
|
|
70
|
-
const values =
|
|
101
|
+
const values = unique.flatMap((record) => columns.map((col) => record[col]));
|
|
71
102
|
return pool.query(query, values);
|
|
72
103
|
}
|
|
73
104
|
};
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import assert from 'node:assert';
|
|
2
2
|
import { describe, it } from 'mocha';
|
|
3
|
-
import postgresSink from '../src/postgresSink.js';
|
|
3
|
+
import postgresSink, { deduplicate } from '../src/postgresSink.js';
|
|
4
4
|
|
|
5
5
|
describe('postgresSink', () => {
|
|
6
6
|
it('throws on missing url', () => {
|
|
@@ -80,4 +80,32 @@ describe('postgresSink', () => {
|
|
|
80
80
|
{ conflict: ['machine', 'start_time'], update: ['name', 'end_time', 'duration'] });
|
|
81
81
|
assert.strictEqual(typeof sink.write, 'function', 'Should have write method with update option');
|
|
82
82
|
});
|
|
83
|
+
|
|
84
|
+
it('deduplicates batch by conflict columns keeping last occurrence', () => {
  const records = [
    { machine: 'ü-1', name: 'pending', start_time: '2024-01-01' },
    { machine: 'ü-1', name: 'completed', start_time: '2024-01-01' },
    { machine: 'ö-2', name: 'pending', start_time: '2024-01-02' }
  ];
  const result = deduplicate(records, ['machine', 'start_time']);
  assert.strictEqual(result.length, 2, 'Should remove duplicate conflict key');
  // A length check alone would pass even if the wrong duplicate survived,
  // so pin exactly which records remain and in what order.
  assert.deepStrictEqual(
    result,
    [records[1], records[2]],
    'Should keep the last occurrence for each conflict key'
  );
});
|
|
93
|
+
|
|
94
|
+
it('keeps last record when conflict columns match', () => {
  // Two records with the same (machine, start_time) pair; only the name differs.
  const earlier = { machine: 'ü-1', name: 'pending', start_time: '2024-01-01' };
  const later = { machine: 'ü-1', name: 'completed', start_time: '2024-01-01' };
  const [survivor] = deduplicate([earlier, later], ['machine', 'start_time']);
  assert.strictEqual(survivor.name, 'completed', 'Should keep the last occurrence');
});
|
|
102
|
+
|
|
103
|
+
it('returns records unchanged when no conflict columns', () => {
  // Builds a fresh copy of the same row so the batch holds two equal records.
  const makeRow = () => ({ machine: 'ä-1', name: 'à', start_time: '2024-01-01' });
  const batch = [makeRow(), makeRow()];
  const { length } = deduplicate(batch, []);
  assert.strictEqual(length, 2, 'Should not deduplicate without conflict columns');
});
|
|
83
111
|
});
|