@sanity/export 6.0.3 → 6.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * CLI tool to detect potentially corrupted export files caused by UTF-8
5
+ * multi-byte characters being split across chunk boundaries during streaming.
6
+ *
7
+ * Usage:
8
+ * npx @sanity/export detect-corrupt <file.ndjson|file.tar.gz|directory>
9
+ */
10
+
11
+ import {existsSync} from 'node:fs'
12
+ import {detectCorruption} from '../dist/detectCorruption.js'
13
+
14
+ const REPLACEMENT_CHAR_DISPLAY = '�' // Character interpolated into the help text below to show what U+FFFD looks like
15
+
16
+ function printUsage() { // Prints the CLI help (usage line, description, supported inputs, examples) with a single console.log call
17
+ console.log(`
18
+ Usage: detect-corrupt <file.ndjson|file.tar.gz|directory>
19
+
20
+ Detects potentially corrupted export files caused by UTF-8 multi-byte
21
+ characters being split across chunk boundaries during streaming.
22
+
23
+ The corruption manifests as U+FFFD replacement characters (${REPLACEMENT_CHAR_DISPLAY}) appearing
24
+ where valid multi-byte characters should be.
25
+
26
+ Supported inputs:
27
+ - .tar.gz or .tgz archive (scans data.ndjson and assets.json inside)
28
+ - .ndjson file
29
+ - Directory containing data.ndjson and/or assets.json
30
+
31
+ Examples:
32
+ npx @sanity/export detect-corrupt export.tar.gz
33
+ npx @sanity/export detect-corrupt data.ndjson
34
+ npx @sanity/export detect-corrupt ./my-export-folder
35
+ `)
36
+ }
37
+
38
/**
 * Prints the findings for one scanned file: a heading with the file name,
 * then up to ten entries (position, replacement-character count and the
 * surrounding text with newline, carriage return and tab shown as escape
 * sequences), and finally a summary line when more entries exist than were
 * printed.
 *
 * @param filename - Name shown in the heading
 * @param corruptions - Entries exposing `line`, `column`, `count` and `context`
 */
function printReport(filename, corruptions) {
  // Cap on printed entries so a badly damaged file does not flood the terminal
  const limit = 10
  const controlEscapes = {'\n': '\\n', '\r': '\\r', '\t': '\\t'}

  console.log(`\n ${filename}:`)

  corruptions.slice(0, limit).forEach(({line, column, count, context}) => {
    console.log(` Line ${line}, col ${column}: ${count} replacement char(s)`)
    // Keep every entry on a single terminal line by showing control characters literally
    const escaped = context.replace(/[\n\r\t]/g, (ch) => controlEscapes[ch])
    console.log(` Context: "...${escaped}..."`)
  })

  const hiddenCount = corruptions.length - limit
  if (hiddenCount > 0) {
    console.log(` ... and ${hiddenCount} more occurrences`)
  }
}
59
+
60
/**
 * CLI entry point.
 *
 * Reads the path to scan from the command line, runs `detectCorruption` on it
 * and prints the scanned files followed by either a success message or a
 * per-file report.
 *
 * Exit codes: 0 when help was printed or no corruption was found; 1 when the
 * path does not exist, corruption was found, or scanning failed.
 */
async function main() {
  const args = process.argv.slice(2)

  // The documented invocation is `npx @sanity/export detect-corrupt <path>`.
  // This script is the package's only bin, so npx is expected to run it
  // directly and forward the literal word `detect-corrupt` as the first
  // argument. Drop that word so it is not mistaken for the path to scan; it is
  // kept only when it is the sole argument and actually names an existing path.
  if (args[0] === 'detect-corrupt' && (args.length > 1 || !existsSync(args[0]))) {
    args.shift()
  }

  if (args.length === 0 || args.includes('--help') || args.includes('-h')) {
    printUsage()
    process.exit(0)
  }

  const filePath = args[0]

  if (!existsSync(filePath)) {
    console.error(`Error: File not found: ${filePath}`)
    process.exit(1)
  }

  console.log(`Scanning ${filePath} for UTF-8 corruption...`)

  try {
    const result = await detectCorruption(filePath)

    // Show which files were scanned
    if (result.scannedFiles.length > 0) {
      console.log(`\nScanned files:`)
      for (const file of result.scannedFiles) {
        console.log(` - ${file}`)
      }
    }

    if (!result.corrupted) {
      console.log('\n✓ No corruption detected')
      process.exit(0)
    }

    console.log(`\n✗ Found potential corruption in ${result.files.size} file(s):`)

    for (const [filename, corruptions] of result.files) {
      printReport(filename, corruptions)
    }

    console.log(`\nTotal: ${result.totalCorruptedLines} line(s) with replacement characters`)
    console.log('\nNote: U+FFFD replacement characters indicate where multi-byte')
    console.log('UTF-8 sequences were corrupted during export streaming.')
    // A non-zero exit code lets scripts detect that corruption was found
    process.exit(1)
  } catch (err) {
    console.error('Error:', err instanceof Error ? err.message : String(err))
    process.exit(1)
  }
}

main()
@@ -0,0 +1,64 @@
1
/**
 * Describes the replacement characters found on one line of a scanned file
 * @public
 */
export interface CorruptionInfo {
    /** 1-based number of the affected line */
    line: number;
    /** Column of the first replacement character on the line */
    column: number;
    /** Excerpt of the line around the first replacement character */
    context: string;
    /** How many replacement characters the line contains */
    count: number;
}
15
/**
 * Outcome of a corruption scan over one or more files
 * @public
 */
export interface ScanResult {
    /** True when at least one scanned file contains replacement characters */
    corrupted: boolean;
    /** Findings keyed by file name; a tar.gz or directory scan can report several files */
    files: Map<string, CorruptionInfo[]>;
    /** Number of affected lines summed over all files */
    totalCorruptedLines: number;
    /** Every file that was scanned */
    scannedFiles: string[];
}
29
/**
 * Checks a single NDJSON file for U+FFFD replacement characters
 *
 * @param filePath - Location of the ndjson file
 * @returns Findings for that file
 * @public
 */
export declare function scanNdjsonFile(filePath: string): Promise<ScanResult>;
37
/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information
 * @public
 */
export declare function scanTarGz(filePath: string): Promise<ScanResult>;
45
/**
 * Checks the data.ndjson and assets.json files of a directory for U+FFFD
 * replacement characters
 *
 * @param dirPath - Location of the directory
 * @returns Combined findings for the files that were scanned
 * @public
 */
export declare function scanDirectory(dirPath: string): Promise<ScanResult>;
53
/**
 * Checks an export (ndjson file, tar.gz archive, or directory) for UTF-8
 * corruption
 *
 * Corruption shows up as U+FFFD replacement characters in places where valid
 * multi-byte characters (CJK, emoji, etc.) were expected.
 *
 * @param filePath - Location of the file or directory to check
 * @returns Findings for everything that was scanned
 * @public
 */
export declare function detectCorruption(filePath: string): Promise<ScanResult>;
64
+ //# sourceMappingURL=detectCorruption.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detectCorruption.d.ts","sourceRoot":"","sources":["../src/detectCorruption.ts"],"names":[],"mappings":"AAWA;;;GAGG;AACH,MAAM,WAAW,cAAc;IAC7B,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAA;IACZ,gDAAgD;IAChD,MAAM,EAAE,MAAM,CAAA;IACd,mCAAmC;IACnC,OAAO,EAAE,MAAM,CAAA;IACf,+CAA+C;IAC/C,KAAK,EAAE,MAAM,CAAA;CACd;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IACzB,sCAAsC;IACtC,SAAS,EAAE,OAAO,CAAA;IAClB,qFAAqF;IACrF,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAA;IACpC,uDAAuD;IACvD,mBAAmB,EAAE,MAAM,CAAA;IAC3B,sCAAsC;IACtC,YAAY,EAAE,MAAM,EAAE,CAAA;CACvB;AAmDD;;;;;;GAMG;AACH,wBAAsB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAe1E;AAED;;;;;;GAMG;AACH,wBAAsB,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAsErE;AAED;;;;;;GAMG;AACH,wBAAsB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAoCxE;AAED;;;;;;;;;GASG;AACH,wBAAsB,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAS5E"}
@@ -0,0 +1,193 @@
1
+ import { createReadStream, existsSync, statSync } from 'node:fs';
2
+ import { basename, join } from 'node:path';
3
+ import { createInterface } from 'node:readline';
4
+ import { createGunzip } from 'node:zlib';
5
+ import tarStream from 'tar-stream';
6
// U+FFFD replacement character - appears when invalid UTF-8 sequences are decoded
const REPLACEMENT_CHAR = '\uFFFD';
/**
 * Scans a line for U+FFFD replacement characters.
 *
 * @param line - Text of the line
 * @param lineNumber - Number reported back in the result's `line` field
 * @returns `null` when the line has no replacement character; otherwise the
 *   1-based column of the first one, the total count on the line, and an
 *   excerpt reaching roughly 20 UTF-16 units before and 30 after that position
 */
function scanLine(line, lineNumber) {
    const index = line.indexOf(REPLACEMENT_CHAR);
    if (index === -1)
        return null;
    // Count total replacement chars on this line
    let count = 0;
    for (let at = index; at !== -1; at = line.indexOf(REPLACEMENT_CHAR, at + 1)) {
        count++;
    }
    const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
    const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;
    // Extract context around the corruption. The window is measured in UTF-16
    // units, so an edge can land in the middle of a surrogate pair (emoji etc.);
    // widen it by one unit in that case so the excerpt never holds half a character.
    let contextStart = Math.max(0, index - 20);
    if (contextStart > 0 &&
        isLowSurrogate(line.charCodeAt(contextStart)) &&
        isHighSurrogate(line.charCodeAt(contextStart - 1))) {
        contextStart--;
    }
    let contextEnd = Math.min(line.length, index + 30);
    if (contextEnd < line.length &&
        isHighSurrogate(line.charCodeAt(contextEnd - 1)) &&
        isLowSurrogate(line.charCodeAt(contextEnd))) {
        contextEnd++;
    }
    const context = line.slice(contextStart, contextEnd);
    return {
        line: lineNumber,
        column: index + 1,
        context,
        count,
    };
}
32
/**
 * Reads a text stream line by line and collects the finding of every line
 * that contains a replacement character. Lines are numbered from 1.
 */
async function scanStream(stream) {
    const reader = createInterface({ input: stream, crlfDelay: Infinity });
    const found = [];
    let currentLine = 1;
    for await (const text of reader) {
        const hit = scanLine(text, currentLine);
        if (hit !== null) {
            found.push(hit);
        }
        currentLine += 1;
    }
    return found;
}
51
/**
 * Checks a single NDJSON file for U+FFFD replacement characters
 *
 * The file is read as UTF-8 text and scanned line by line. It is always listed
 * in `scannedFiles`, but only appears in `files` when something was found.
 *
 * @param filePath - Location of the ndjson file
 * @returns Findings for that file
 * @public
 */
export async function scanNdjsonFile(filePath) {
    const corruptions = await scanStream(createReadStream(filePath, { encoding: 'utf8' }));
    const corrupted = corruptions.length > 0;
    const files = new Map(corrupted ? [[filePath, corruptions]] : []);
    return {
        corrupted,
        files,
        totalCorruptedLines: corruptions.length,
        scannedFiles: [filePath],
    };
}
72
/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Entries are matched by base name, so the files are found at any depth inside
 * the archive. Every other entry is drained and ignored.
 *
 * The returned promise rejects when reading the file, decompressing it,
 * parsing the archive, or decoding a matched entry fails.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information
 * @public
 */
export async function scanTarGz(filePath) {
    const extract = tarStream.extract();
    const results = new Map();
    const scannedFiles = [];
    // Same file names that scanDirectory looks for
    const targetFiles = ['data.ndjson', 'assets.json'];
    return new Promise((resolve, reject) => {
        const source = createReadStream(filePath);
        const gunzip = createGunzip();
        // Stop reading the archive and surface the failure to the caller
        const fail = (err) => {
            source.destroy();
            reject(err);
        };
        extract.on('entry', (header, stream, next) => {
            if (!targetFiles.includes(basename(header.name))) {
                // Skip this entry
                stream.on('end', next);
                stream.resume();
                return;
            }
            scannedFiles.push(header.name);
            const chunks = [];
            stream.on('data', (chunk) => {
                chunks.push(chunk);
            });
            stream.on('end', () => {
                try {
                    // Decode only after joining all chunks so a multi-byte character
                    // that straddles two chunks is decoded as one character
                    const content = Buffer.concat(chunks).toString('utf8');
                    const corruptions = [];
                    content.split(/\r?\n/).forEach((line, i) => {
                        if (line.length === 0)
                            return;
                        const corruption = scanLine(line, i + 1);
                        if (corruption) {
                            corruptions.push(corruption);
                        }
                    });
                    if (corruptions.length > 0) {
                        results.set(header.name, corruptions);
                    }
                }
                catch (err) {
                    // An exception thrown in an event handler would otherwise bypass the
                    // promise (for example if the entry is too large to decode)
                    fail(err);
                    return;
                }
                next();
            });
            stream.on('error', fail);
        });
        extract.on('finish', () => {
            let totalCorruptedLines = 0;
            for (const corruptions of results.values()) {
                totalCorruptedLines += corruptions.length;
            }
            resolve({
                corrupted: results.size > 0,
                files: results,
                totalCorruptedLines,
                scannedFiles,
            });
        });
        // pipe() does not forward errors, so each stage needs its own listener
        extract.on('error', fail);
        gunzip.on('error', fail);
        source.on('error', fail);
        source.pipe(gunzip).pipe(extract);
    });
}
138
/**
 * Checks the data.ndjson and assets.json files of a directory for U+FFFD
 * replacement characters
 *
 * Only the files that exist are scanned; the call fails when neither exists.
 *
 * @param dirPath - Location of the directory
 * @returns Combined findings for the files that were scanned
 * @public
 */
export async function scanDirectory(dirPath) {
    const present = ['data.ndjson', 'assets.json']
        .map((name) => join(dirPath, name))
        .filter((candidate) => existsSync(candidate));
    if (present.length === 0) {
        throw new Error(`No data.ndjson or assets.json found in directory: ${dirPath}`);
    }
    const files = new Map();
    const scannedFiles = [];
    let totalCorruptedLines = 0;
    // Scan one file after another and merge the individual results
    for (const candidate of present) {
        const { files: found, scannedFiles: scanned } = await scanNdjsonFile(candidate);
        scannedFiles.push(...scanned);
        found.forEach((corruptions, file) => {
            files.set(file, corruptions);
            totalCorruptedLines += corruptions.length;
        });
    }
    return {
        corrupted: files.size > 0,
        files,
        totalCorruptedLines,
        scannedFiles,
    };
}
175
/**
 * Checks an export (ndjson file, tar.gz archive, or directory) for UTF-8
 * corruption
 *
 * Corruption shows up as U+FFFD replacement characters in places where valid
 * multi-byte characters (CJK, emoji, etc.) were expected.
 *
 * Directories go to scanDirectory, paths ending in `.tar.gz` or `.tgz` go to
 * scanTarGz, and everything else is treated as an NDJSON file.
 *
 * @param filePath - Location of the file or directory to check
 * @returns Findings for everything that was scanned
 * @public
 */
export async function detectCorruption(filePath) {
    if (statSync(filePath).isDirectory()) {
        return scanDirectory(filePath);
    }
    const archiveSuffixes = ['.tar.gz', '.tgz'];
    if (archiveSuffixes.some((suffix) => filePath.endsWith(suffix))) {
        return scanTarGz(filePath);
    }
    return scanNdjsonFile(filePath);
}
193
+ //# sourceMappingURL=detectCorruption.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detectCorruption.js","sourceRoot":"","sources":["../src/detectCorruption.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAC,MAAM,SAAS,CAAA;AAC9D,OAAO,EAAC,QAAQ,EAAE,IAAI,EAAC,MAAM,WAAW,CAAA;AACxC,OAAO,EAAC,eAAe,EAAC,MAAM,eAAe,CAAA;AAE7C,OAAO,EAAC,YAAY,EAAC,MAAM,WAAW,CAAA;AAEtC,OAAO,SAAS,MAAM,YAAY,CAAA;AAElC,kFAAkF;AAClF,MAAM,gBAAgB,GAAG,QAAQ,CAAA;AAgCjC;;GAEG;AACH,SAAS,QAAQ,CAAC,IAAY,EAAE,UAAkB;IAChD,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAA;IAC5C,IAAI,KAAK,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IAE7B,6CAA6C;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QACxB,IAAI,IAAI,KAAK,gBAAgB;YAAE,KAAK,EAAE,CAAA;IACxC,CAAC;IAED,wCAAwC;IACxC,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,GAAG,EAAE,CAAC,CAAA;IAC5C,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,KAAK,GAAG,EAAE,CAAC,CAAA;IACpD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,UAAU,CAAC,CAAA;IAEpD,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,MAAM,EAAE,KAAK,GAAG,CAAC;QACjB,OAAO;QACP,KAAK;KACN,CAAA;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,UAAU,CAAC,MAAgB;IACxC,MAAM,WAAW,GAAqB,EAAE,CAAA;IACxC,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,MAAM,EAAE,GAAG,eAAe,CAAC;QACzB,KAAK,EAAE,MAAM;QACb,SAAS,EAAE,QAAQ;KACpB,CAAC,CAAA;IAEF,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;QAC5B,UAAU,EAAE,CAAA;QACZ,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,EAAE,UAAU,CAAC,CAAA;QAC7C,IAAI,UAAU,EAAE,CAAC;YACf,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAA;AACpB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,QAAgB;IACnD,MAAM,MAAM,GAAG,gBAAgB,CAAC,QAAQ,EAAE,EAAC,QAAQ,EAAE,MAAM,EAAC,CAAC,CAAA;IAC7D,MAAM,WAAW,GAAG,MAAM,UAAU,CAAC,MAAM,CAAC,CAAA;IAE5C,MAAM,KAAK,GAAG,IAAI,GAAG,EAA4B,CAAA;IACjD,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAA;IAClC,CAAC;IAED,OAAO;QACL,SAAS,EAAE,WAAW,CAAC,MAAM,GAAG,CAAC;QACjC,KAAK;QACL,mBAAmB,EAAE,WAAW,CAAC,MAAM;QACvC,YAAY,EAAE,CAAC,QAAQ,CAAC;KACzB,CAAA;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,SAA
S,CAAC,QAAgB;IAC9C,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,EAAE,CAAA;IAEnC,MAAM,OAAO,GAAG,IAAI,GAAG,EAA4B,CAAA;IACnD,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,MAAM,WAAW,GAAG,CAAC,aAAa,EAAE,YAAY,CAAC,CAAA;IAEjD,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE;YAC3C,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YAE1C,IAAI,WAAW,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;gBACvC,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;gBAC9B,MAAM,MAAM,GAAa,EAAE,CAAA;gBAE3B,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;oBAClC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;gBACpB,CAAC,CAAC,CAAA;gBAEF,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE;oBACpB,2CAA2C;oBAC3C,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;oBACtD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAA;oBACpC,MAAM,WAAW,GAAqB,EAAE,CAAA;oBAExC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;wBACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;wBACrB,IAAI,IAAI,KAAK,SAAS,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;4BAC1C,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAA;4BACxC,IAAI,UAAU,EAAE,CAAC;gCACf,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;4BAC9B,CAAC;wBACH,CAAC;oBACH,CAAC;oBAED,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBAC3B,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,WAAW,CAAC,CAAA;oBACvC,CAAC;oBACD,IAAI,EAAE,CAAA;gBACR,CAAC,CAAC,CAAA;gBAEF,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;YAC5B,CAAC;iBAAM,CAAC;gBACN,kBAAkB;gBAClB,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,IAAI,CAAC,CAAA;gBACtB,MAAM,CAAC,MAAM,EAAE,CAAA;YACjB,CAAC;QACH,CAAC,CAAC,CAAA;QAEF,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE;YACxB,IAAI,mBAAmB,GAAG,CAAC,CAAA;YAC3B,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;gBAC3C,mBAAmB,IAAI,WAAW,CAAC,MAAM,CAAA;YAC3C,CAAC;YAED,OAAO,CAAC;gBACN,SAAS,EAAE,OAAO,CAAC,IAAI,GAAG,CAAC;gBAC3B,KAAK,EAAE,OAAO;gBACd,mBAAmB;gBACnB,YAAY;aACb,CAAC,CAAA;QACJ,CAAC,CAAC,CAAA;QAEF,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QAE3B,MAAM,MAAM,GAAG,YAAY,EAAE,CAAA;
QAC7B,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QAE1B,gBAAgB,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;AACJ,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,OAAe;IACjD,MAAM,WAAW,GAAG,CAAC,aAAa,EAAE,aAAa,CAAC,CAAA;IAClD,MAAM,UAAU,GAAa,EAAE,CAAA;IAE/B,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;QACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAA;QACxC,IAAI,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YACzB,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;QAC3B,CAAC;IACH,CAAC;IAED,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,KAAK,CACb,qDAAqD,OAAO,EAAE,CAC/D,CAAA;IACH,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,GAAG,EAA4B,CAAA;IACnD,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,IAAI,mBAAmB,GAAG,CAAC,CAAA;IAE3B,KAAK,MAAM,QAAQ,IAAI,UAAU,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,CAAA;QAC7C,YAAY,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,YAAY,CAAC,CAAA;QACzC,KAAK,MAAM,CAAC,IAAI,EAAE,WAAW,CAAC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YAC/C,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,WAAW,CAAC,CAAA;YAC9B,mBAAmB,IAAI,WAAW,CAAC,MAAM,CAAA;QAC3C,CAAC;IACH,CAAC;IAED,OAAO;QACL,SAAS,EAAE,OAAO,CAAC,IAAI,GAAG,CAAC;QAC3B,KAAK,EAAE,OAAO;QACd,mBAAmB;QACnB,YAAY;KACb,CAAA;AACH,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,QAAgB;IACrD,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAA;IAE/B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;QACvB,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAA;IAChC,CAAC;IAED,MAAM,MAAM,GAAG,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;IACxE,OAAO,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAA;AAChE,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/export",
3
- "version": "6.0.3",
3
+ "version": "6.0.4",
4
4
  "description": "Export Sanity documents and assets",
5
5
  "keywords": [
6
6
  "sanity",
@@ -30,7 +30,11 @@
30
30
  },
31
31
  "main": "./dist/index.js",
32
32
  "types": "./dist/index.d.ts",
33
+ "bin": {
34
+ "detect-corrupt": "./bin/detect-corrupt.js"
35
+ },
33
36
  "files": [
37
+ "bin",
34
38
  "dist",
35
39
  "src"
36
40
  ],
@@ -47,7 +51,8 @@
47
51
  "debug": "^4.3.4",
48
52
  "get-it": "^8.6.10",
49
53
  "json-stream-stringify": "^3.1.6",
50
- "p-queue": "^9.0.1"
54
+ "p-queue": "^9.0.1",
55
+ "tar-stream": "^3.1.7"
51
56
  },
52
57
  "devDependencies": {
53
58
  "@eslint/js": "^9.39.1",
@@ -56,6 +61,7 @@
56
61
  "@types/archiver": "^7.0.0",
57
62
  "@types/debug": "^4.1.12",
58
63
  "@types/node": "^20.19.0",
64
+ "@types/tar-stream": "^3.1.3",
59
65
  "@vitest/coverage-v8": "^4.0.15",
60
66
  "eslint": "^8.57.0",
61
67
  "eslint-config-prettier": "^9.1.0",
@@ -0,0 +1,258 @@
1
+ import {createReadStream, existsSync, statSync} from 'node:fs'
2
+ import {basename, join} from 'node:path'
3
+ import {createInterface} from 'node:readline'
4
+ import type {Readable} from 'node:stream'
5
+ import {createGunzip} from 'node:zlib'
6
+
7
+ import tarStream from 'tar-stream'
8
+
9
+ // U+FFFD replacement character - appears when invalid UTF-8 sequences are decoded
10
+ const REPLACEMENT_CHAR = '\uFFFD'
11
+
12
/**
 * Describes the replacement characters found on one line of a scanned file
 * @public
 */
export interface CorruptionInfo {
  /** 1-based number of the affected line */
  line: number
  /** Column of the first replacement character on the line */
  column: number
  /** Excerpt of the line around the first replacement character */
  context: string
  /** How many replacement characters the line contains */
  count: number
}
26
+
27
/**
 * Outcome of a corruption scan over one or more files
 * @public
 */
export interface ScanResult {
  /** True when at least one scanned file contains replacement characters */
  corrupted: boolean
  /** Findings keyed by file name; a tar.gz or directory scan can report several files */
  files: Map<string, CorruptionInfo[]>
  /** Number of affected lines summed over all files */
  totalCorruptedLines: number
  /** Every file that was scanned */
  scannedFiles: string[]
}
41
+
42
/**
 * Scans a line for U+FFFD replacement characters.
 *
 * @param line - Text of the line
 * @param lineNumber - Number reported back in the result's `line` field
 * @returns `null` when the line has no replacement character; otherwise the
 *   1-based column of the first one, the total count on the line, and an
 *   excerpt reaching roughly 20 UTF-16 units before and 30 after that position
 */
function scanLine(line: string, lineNumber: number): CorruptionInfo | null {
  const index = line.indexOf(REPLACEMENT_CHAR)
  if (index === -1) return null

  // Count total replacement chars on this line
  let count = 0
  for (let at = index; at !== -1; at = line.indexOf(REPLACEMENT_CHAR, at + 1)) {
    count++
  }

  const isHighSurrogate = (code: number): boolean => code >= 0xd800 && code <= 0xdbff
  const isLowSurrogate = (code: number): boolean => code >= 0xdc00 && code <= 0xdfff

  // Extract context around the corruption. The window is measured in UTF-16
  // units, so an edge can land in the middle of a surrogate pair (emoji etc.);
  // widen it by one unit in that case so the excerpt never holds half a character.
  let contextStart = Math.max(0, index - 20)
  if (
    contextStart > 0 &&
    isLowSurrogate(line.charCodeAt(contextStart)) &&
    isHighSurrogate(line.charCodeAt(contextStart - 1))
  ) {
    contextStart--
  }

  let contextEnd = Math.min(line.length, index + 30)
  if (
    contextEnd < line.length &&
    isHighSurrogate(line.charCodeAt(contextEnd - 1)) &&
    isLowSurrogate(line.charCodeAt(contextEnd))
  ) {
    contextEnd++
  }

  const context = line.slice(contextStart, contextEnd)

  return {
    line: lineNumber,
    column: index + 1,
    context,
    count,
  }
}
67
+
68
/**
 * Reads a text stream line by line and collects the finding of every line
 * that contains a replacement character. Lines are numbered from 1.
 */
async function scanStream(stream: Readable): Promise<CorruptionInfo[]> {
  const reader = createInterface({input: stream, crlfDelay: Infinity})
  const found: CorruptionInfo[] = []
  let currentLine = 1

  for await (const text of reader) {
    const hit = scanLine(text, currentLine)
    if (hit !== null) {
      found.push(hit)
    }
    currentLine += 1
  }

  return found
}
90
+
91
/**
 * Checks a single NDJSON file for U+FFFD replacement characters
 *
 * The file is read as UTF-8 text and scanned line by line. It is always listed
 * in `scannedFiles`, but only appears in `files` when something was found.
 *
 * @param filePath - Location of the ndjson file
 * @returns Findings for that file
 * @public
 */
export async function scanNdjsonFile(filePath: string): Promise<ScanResult> {
  const corruptions = await scanStream(createReadStream(filePath, {encoding: 'utf8'}))
  const corrupted = corruptions.length > 0

  const files = new Map<string, CorruptionInfo[]>()
  if (corrupted) {
    files.set(filePath, corruptions)
  }

  return {
    corrupted,
    files,
    totalCorruptedLines: corruptions.length,
    scannedFiles: [filePath],
  }
}
114
+
115
/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Entries are matched by base name, so the files are found at any depth inside
 * the archive. Every other entry is drained and ignored.
 *
 * The returned promise rejects when reading the file, decompressing it,
 * parsing the archive, or decoding a matched entry fails.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information
 * @public
 */
export async function scanTarGz(filePath: string): Promise<ScanResult> {
  const extract = tarStream.extract()

  const results = new Map<string, CorruptionInfo[]>()
  const scannedFiles: string[] = []
  // Same file names that scanDirectory looks for
  const targetFiles = ['data.ndjson', 'assets.json']

  return new Promise((resolve, reject) => {
    const source = createReadStream(filePath)
    const gunzip = createGunzip()

    // Stop reading the archive and surface the failure to the caller
    const fail = (err: unknown): void => {
      source.destroy()
      reject(err)
    }

    extract.on('entry', (header, stream, next) => {
      if (!targetFiles.includes(basename(header.name))) {
        // Skip this entry
        stream.on('end', next)
        stream.resume()
        return
      }

      scannedFiles.push(header.name)
      const chunks: Buffer[] = []

      stream.on('data', (chunk: Buffer) => {
        chunks.push(chunk)
      })

      stream.on('end', () => {
        try {
          // Decode only after joining all chunks so a multi-byte character
          // that straddles two chunks is decoded as one character
          const content = Buffer.concat(chunks).toString('utf8')
          const corruptions: CorruptionInfo[] = []

          content.split(/\r?\n/).forEach((line, i) => {
            if (line.length === 0) return
            const corruption = scanLine(line, i + 1)
            if (corruption) {
              corruptions.push(corruption)
            }
          })

          if (corruptions.length > 0) {
            results.set(header.name, corruptions)
          }
        } catch (err) {
          // An exception thrown in an event handler would otherwise bypass the
          // promise (for example if the entry is too large to decode)
          fail(err)
          return
        }
        next()
      })

      stream.on('error', fail)
    })

    extract.on('finish', () => {
      let totalCorruptedLines = 0
      for (const corruptions of results.values()) {
        totalCorruptedLines += corruptions.length
      }

      resolve({
        corrupted: results.size > 0,
        files: results,
        totalCorruptedLines,
        scannedFiles,
      })
    })

    // pipe() does not forward errors, so each stage needs its own listener
    extract.on('error', fail)
    gunzip.on('error', fail)
    source.on('error', fail)

    source.pipe(gunzip).pipe(extract)
  })
}
193
+
194
/**
 * Checks the data.ndjson and assets.json files of a directory for U+FFFD
 * replacement characters
 *
 * Only the files that exist are scanned; the call fails when neither exists.
 *
 * @param dirPath - Location of the directory
 * @returns Combined findings for the files that were scanned
 * @public
 */
export async function scanDirectory(dirPath: string): Promise<ScanResult> {
  const present = ['data.ndjson', 'assets.json']
    .map((name) => join(dirPath, name))
    .filter((candidate) => existsSync(candidate))

  if (present.length === 0) {
    throw new Error(
      `No data.ndjson or assets.json found in directory: ${dirPath}`,
    )
  }

  const files = new Map<string, CorruptionInfo[]>()
  const scannedFiles: string[] = []
  let totalCorruptedLines = 0

  // Scan one file after another and merge the individual results
  for (const candidate of present) {
    const {files: found, scannedFiles: scanned} = await scanNdjsonFile(candidate)
    scannedFiles.push(...scanned)
    found.forEach((corruptions, file) => {
      files.set(file, corruptions)
      totalCorruptedLines += corruptions.length
    })
  }

  return {
    corrupted: files.size > 0,
    files,
    totalCorruptedLines,
    scannedFiles,
  }
}
238
+
239
/**
 * Checks an export (ndjson file, tar.gz archive, or directory) for UTF-8
 * corruption
 *
 * Corruption shows up as U+FFFD replacement characters in places where valid
 * multi-byte characters (CJK, emoji, etc.) were expected.
 *
 * Directories go to scanDirectory, paths ending in `.tar.gz` or `.tgz` go to
 * scanTarGz, and everything else is treated as an NDJSON file.
 *
 * @param filePath - Location of the file or directory to check
 * @returns Findings for everything that was scanned
 * @public
 */
export async function detectCorruption(filePath: string): Promise<ScanResult> {
  if (statSync(filePath).isDirectory()) {
    return scanDirectory(filePath)
  }

  const archiveSuffixes = ['.tar.gz', '.tgz']
  if (archiveSuffixes.some((suffix) => filePath.endsWith(suffix))) {
    return scanTarGz(filePath)
  }

  return scanNdjsonFile(filePath)
}