@yarkivaev/source-to-sink 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yarkivaev/source-to-sink",
3
- "version": "1.0.2",
3
+ "version": "1.0.4",
4
4
  "description": "Generic library for building data streaming pipelines",
5
5
  "repository": {
6
6
  "type": "git",
@@ -5,7 +5,8 @@ import pg from 'pg';
5
5
  *
6
6
  * Creates a pg.Pool internally and implements the Sink
7
7
  * interface for use with batch collectors. Supports optional
8
- * conflict resolution via ON CONFLICT DO NOTHING.
8
+ * conflict resolution via ON CONFLICT DO NOTHING or
9
+ * ON CONFLICT DO UPDATE SET.
9
10
  *
10
11
  * @example
11
12
  * const sink = postgresSink('postgresql://localhost:5432/db', 'metrics', ['ts', 'value']);
@@ -20,9 +21,55 @@ import pg from 'pg';
20
21
  * @param {string} table - Target table name
21
22
  * @param {Array<string>} columns - Column names for insertion
22
23
  * @param {object} [options] - Optional configuration
23
- * @param {Array<string>} [options.conflict] - Columns for ON CONFLICT DO NOTHING clause
24
+ * @param {Array<string>} [options.conflict] - Columns for ON CONFLICT clause
25
+ * @param {Array<string>} [options.update] - Columns to overwrite via DO UPDATE SET; ignored unless options.conflict is non-empty
24
26
  * @returns {object} Sink with write(records) method
25
27
  */
28
/**
 * Removes duplicate records by conflict key within a batch.
 *
 * PostgreSQL rejects INSERT batches where ON CONFLICT would update
 * the same row twice. This keeps the last occurrence for each
 * unique combination of conflict column values. The surviving record
 * stays at the position where its key first appeared in the batch.
 *
 * Key values are compared through an unambiguous encoding rather than
 * a plain string join, so that:
 * - Date values are compared with millisecond precision,
 * - null/undefined are not confused with the empty string,
 * - no value can collide with another by containing the separator.
 * Other values are compared by their string form, so 1 and '1' are
 * still treated as the same key.
 *
 * @example
 * deduplicate(
 *   [{ machine: 'a', start: 1, name: 'pending' },
 *    { machine: 'a', start: 1, name: 'completed' }],
 *   ['machine', 'start']
 * );
 * // => [{ machine: 'a', start: 1, name: 'completed' }]
 *
 * @param {Array} records - Array of record objects
 * @param {Array<string>} keys - Conflict column names
 * @returns {Array} Deduplicated records (the input array itself when keys is empty)
 */
export function deduplicate(records, keys) {
  if (keys.length === 0) return records;
  const latest = new Map();
  for (const record of records) {
    // Map#set on an existing key replaces the value but keeps its slot,
    // which yields "last occurrence wins, first position kept".
    latest.set(conflictKey(record, keys), record);
  }
  return Array.from(latest.values());
}

/**
 * Encodes the conflict column values of one record as a collision-free string.
 *
 * @param {object} record - Record to read the key columns from
 * @param {Array<string>} keys - Conflict column names
 * @returns {string} JSON array of the normalized key parts
 */
function conflictKey(record, keys) {
  // JSON quoting delimits every part, so no separator character is needed.
  return JSON.stringify(keys.map((key) => keyPart(record[key])));
}

/**
 * Normalizes a single key value for comparison.
 *
 * @param {*} value - Raw column value
 * @returns {string|null} null for null/undefined, otherwise a string form
 */
function keyPart(value) {
  if (value === null || value === undefined) return null;
  // String(date) drops milliseconds; the ISO form keeps them.
  // Invalid dates cannot be ISO-formatted and fall through to String().
  if (value instanceof Date && !Number.isNaN(value.getTime())) {
    return value.toISOString();
  }
  return String(value);
}
56
+
57
/**
 * Produces the trailing ON CONFLICT clause for the INSERT statement.
 *
 * - No usable `conflict` array: returns an empty string.
 * - `conflict` only: returns an `ON CONFLICT (...) DO NOTHING` clause.
 * - `conflict` plus a non-empty `update` array: returns an
 *   `ON CONFLICT (...) DO UPDATE SET col = EXCLUDED.col, ...` clause.
 *
 * Column names are interpolated as given, without quoting.
 *
 * @param {object} options - Sink options with conflict and update arrays
 * @returns {string} SQL suffix (with a leading space) or empty string
 */
function buildSuffix(options) {
  const { conflict, update } = options;
  const hasTarget = Array.isArray(conflict) && conflict.length > 0;
  if (!hasTarget) {
    return '';
  }
  const hasUpdates = Array.isArray(update) && update.length > 0;
  const action = hasUpdates
    ? `DO UPDATE SET ${update.map((name) => `${name} = EXCLUDED.${name}`).join(', ')}`
    : 'DO NOTHING';
  return ` ON CONFLICT (${conflict.join(', ')}) ${action}`;
}
72
+
26
73
  export default function postgresSink(url, table, columns, options = {}) {
27
74
  if (typeof url !== 'string' || url.length === 0) {
28
75
  throw new Error('URL must be a non-empty string');
@@ -34,9 +81,8 @@ export default function postgresSink(url, table, columns, options = {}) {
34
81
  throw new Error('Columns must be a non-empty array');
35
82
  }
36
83
  const pool = new pg.Pool({ connectionString: url });
37
- const suffix = Array.isArray(options.conflict) && options.conflict.length > 0
38
- ? ` ON CONFLICT (${options.conflict.join(', ')}) DO NOTHING`
39
- : '';
84
+ const suffix = buildSuffix(options);
85
+ const conflict = Array.isArray(options.conflict) ? options.conflict : [];
40
86
  return {
41
87
  /**
42
88
  * Writes records to PostgreSQL table.
@@ -45,13 +91,14 @@ export default function postgresSink(url, table, columns, options = {}) {
45
91
  * @returns {Promise} Promise resolving when insert completes
46
92
  */
47
93
  write(records) {
48
- const placeholders = records.map((_, i) => {
94
+ const unique = deduplicate(records, conflict);
95
+ const placeholders = unique.map((_, i) => {
49
96
  const offset = i * columns.length;
50
97
  const row = columns.map((__, j) => `$${offset + j + 1}`).join(', ');
51
98
  return `(${row})`;
52
99
  }).join(', ');
53
100
  const query = `INSERT INTO ${table} (${columns.join(', ')}) VALUES ${placeholders}${suffix}`;
54
- const values = records.flatMap((record) => columns.map((col) => record[col]));
101
+ const values = unique.flatMap((record) => columns.map((col) => record[col]));
55
102
  return pool.query(query, values);
56
103
  }
57
104
  };
@@ -1,6 +1,6 @@
1
1
  import assert from 'node:assert';
2
2
  import { describe, it } from 'mocha';
3
- import postgresSink from '../src/postgresSink.js';
3
+ import postgresSink, { deduplicate } from '../src/postgresSink.js';
4
4
 
5
5
  describe('postgresSink', () => {
6
6
  it('throws on missing url', () => {
@@ -73,4 +73,39 @@ describe('postgresSink', () => {
73
73
  ['ts', 'value'], {});
74
74
  assert.strictEqual(typeof sink.write, 'function', 'Should have write method without conflict key');
75
75
  });
76
+
77
it('returns sink with write method when conflict and update options are provided', () => {
  const columns = ['machine', 'name', 'start_time', 'end_time', 'duration'];
  const options = {
    conflict: ['machine', 'start_time'],
    update: ['name', 'end_time', 'duration'],
  };
  const sink = postgresSink('postgresql://localhost:5432/db', 'segments', columns, options);
  assert.strictEqual(typeof sink.write, 'function', 'Should have write method with update option');
});

it('deduplicates batch by conflict columns keeping last occurrence', () => {
  // The first two records share (machine, start_time); the third is distinct.
  const pending = { machine: 'ü-1', name: 'pending', start_time: '2024-01-01' };
  const completed = { machine: 'ü-1', name: 'completed', start_time: '2024-01-01' };
  const other = { machine: 'ö-2', name: 'pending', start_time: '2024-01-02' };
  const { length } = deduplicate([pending, completed, other], ['machine', 'start_time']);
  assert.strictEqual(length, 2, 'Should remove duplicate conflict key');
});

it('keeps last record when conflict columns match', () => {
  const batch = ['pending', 'completed'].map((name) => ({
    machine: 'ü-1',
    name,
    start_time: '2024-01-01',
  }));
  const [kept] = deduplicate(batch, ['machine', 'start_time']);
  assert.strictEqual(kept.name, 'completed', 'Should keep the last occurrence');
});

it('returns records unchanged when no conflict columns', () => {
  // Two identical records, with no key columns to compare them on.
  const makeRecord = () => ({ machine: 'ä-1', name: 'à', start_time: '2024-01-01' });
  const result = deduplicate([makeRecord(), makeRecord()], []);
  assert.strictEqual(result.length, 2, 'Should not deduplicate without conflict columns');
});
76
111
  });