@lde/sparql-qlever 0.14.3 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"file":"importer.d.ts","sourceRoot":"","sources":["../src/importer.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,QAAQ,IAAI,iBAAiB,EAC7B,eAAe,EACf,YAAY,EACZ,gBAAgB,EAChB,YAAY,EACb,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAK5C,MAAM,WAAW,kBAAkB;IACjC,oBAAoB;IACpB,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC,yBAAyB;IACzB,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,uEAAuE;IACvE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,oBAAoB;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,yJAAyJ;IACzJ,+BAA+B,CAAC,EAAE,OAAO,CAAC;CAC3C;AAED,MAAM,WAAW,qBAAsB,SAAQ,eAAe;IAC5D,sBAAsB;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,kBAAkB,CAAC;CACpC;AAWD,qBAAa,QAAS,YAAW,iBAAiB;IAChD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAkB;gBAE9B,OAAO,EAAE,qBAAqB;IAa7B,MAAM,CACjB,aAAa,EAAE,YAAY,EAAE,GAC5B,OAAO,CAAC,YAAY,GAAG,gBAAgB,GAAG,YAAY,CAAC;YA+B5C,QAAQ;IAkDtB,OAAO,CAAC,gBAAgB;IAQxB,OAAO,CAAC,aAAa;IAIrB;;OAEG;YACW,eAAe;IA0B7B,yDAAyD;YAC3C,eAAe;YAaf,cAAc;YAKd,KAAK;CA0CpB"}
1
+ {"version":3,"file":"importer.d.ts","sourceRoot":"","sources":["../src/importer.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,QAAQ,IAAI,iBAAiB,EAC7B,eAAe,EACf,YAAY,EACZ,gBAAgB,EAChB,YAAY,EACb,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAyB,YAAY,EAAE,MAAM,cAAc,CAAC;AAMnE,MAAM,WAAW,kBAAkB;IACjC,oBAAoB;IACpB,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC,yBAAyB;IACzB,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,uEAAuE;IACvE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,oBAAoB;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,yJAAyJ;IACzJ,+BAA+B,CAAC,EAAE,OAAO,CAAC;CAC3C;AAED,MAAM,WAAW,qBAAsB,SAAQ,eAAe;IAC5D,sBAAsB;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,kBAAkB,CAAC;CACpC;AAWD,qBAAa,QAAS,YAAW,iBAAiB;IAChD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAkB;gBAE9B,OAAO,EAAE,qBAAqB;IAa7B,MAAM,CACjB,aAAa,EAAE,YAAY,EAAE,GAC5B,OAAO,CAAC,YAAY,GAAG,gBAAgB,GAAG,YAAY,CAAC;YAqC5C,QAAQ;IA6DtB,OAAO,CAAC,gBAAgB;IAQxB,OAAO,CAAC,aAAa;IAIrB;;OAEG;YACW,eAAe;IA0B7B,yDAAyD;YAC3C,eAAe;YAaf,cAAc;YAKd,KAAK;CA4CpB"}
package/dist/importer.js CHANGED
@@ -1,7 +1,9 @@
1
1
  import { ImportFailed, ImportSuccessful, NotSupported, } from '@lde/sparql-importer';
2
+ import { compressionMediaTypes } from '@lde/dataset';
2
3
  import { LastModifiedDownloader } from '@lde/distribution-downloader';
3
4
  import { basename, dirname, join } from 'path';
4
5
  import { readFile, stat, writeFile } from 'node:fs/promises';
6
+ import { needsPreprocessing, preprocess } from './preprocess.js';
5
7
  export class Importer {
6
8
  options;
7
9
  constructor(options) {
@@ -17,8 +19,11 @@ export class Importer {
17
19
  };
18
20
  }
19
21
  async import(distributions) {
20
- const downloadDistributions = distributions.filter((distribution) => distribution.mimeType !== undefined &&
21
- supportedFormats.has(distribution.mimeType));
22
+ const downloadDistributions = distributions
23
+ .filter((distribution) => distribution.mimeType !== undefined &&
24
+ acceptedMediaTypes.includes(distribution.mimeType))
25
+ .sort((a, b) => acceptedMediaTypes.indexOf(a.mimeType) -
26
+ acceptedMediaTypes.indexOf(b.mimeType));
22
27
  if (downloadDistributions.length === 0) {
23
28
  return new NotSupported();
24
29
  }
@@ -52,15 +57,29 @@ export class Importer {
52
57
  }
53
58
  return new ImportSuccessful(distribution, undefined, tripleCount);
54
59
  }
55
- const { format, warning } = fileFormatFor(distribution.mimeType, basename(localFile), headers.get('Content-Type') ?? undefined);
60
+ let indexFile = localFile;
61
+ let format;
62
+ const warnings = [];
63
+ if (needsPreprocessing(distribution)) {
64
+ const result = await preprocess(localFile, distribution);
65
+ indexFile = result.path;
66
+ format = result.format;
67
+ warnings.push(...result.warnings);
68
+ }
69
+ else {
70
+ const resolved = fileFormatFor(distribution.mimeType, basename(localFile), headers.get('Content-Type') ?? undefined);
71
+ format = resolved.format;
72
+ if (resolved.warning)
73
+ warnings.push(resolved.warning);
74
+ }
56
75
  let logs;
57
76
  try {
58
- logs = await this.index(localFile, format);
77
+ logs = await this.index(indexFile, format);
59
78
  }
60
79
  catch (error) {
61
80
  if (format === 'ttl' &&
62
81
  error.message?.includes('multiline string literal')) {
63
- logs = await this.index(localFile, format, false);
82
+ logs = await this.index(indexFile, format, false);
64
83
  }
65
84
  else {
66
85
  throw error;
@@ -71,7 +90,6 @@ export class Importer {
71
90
  return new ImportFailed(distribution, 'Indexed 0 triples from distribution');
72
91
  }
73
92
  await this.writeCacheInfo(localFile);
74
- const warnings = warning ? [warning] : [];
75
93
  return new ImportSuccessful(distribution, undefined, tripleCount, warnings);
76
94
  }
77
95
  parseTripleCount(logs) {
@@ -148,15 +166,53 @@ export class Importer {
148
166
  .filter(Boolean)
149
167
  .join(' ');
150
168
  const metadataFile = `${this.options.indexName}.meta-data.json`;
151
- const indexTask = await this.options.taskRunner.run(`(gunzip -c '${basename(file)}' 2>/dev/null || cat '${basename(file)}') | qlever-index ${flags} && cat ${metadataFile}`);
169
+ const localName = basename(file);
170
+ const decompressCommand = localName.toLowerCase().endsWith('.zip')
171
+ ? `unzip -p ${shellQuote(localName)}`
172
+ : `(gunzip -c ${shellQuote(localName)} 2>/dev/null || cat ${shellQuote(localName)})`;
173
+ const indexTask = await this.options.taskRunner.run(`${decompressCommand} | qlever-index ${flags} && cat ${shellQuote(metadataFile)}`);
152
174
  return await this.options.taskRunner.wait(indexTask);
153
175
  }
154
176
  }
155
- const supportedFormats = new Map([
/**
 * Turn a value into a single, literal word for a POSIX shell command line.
 *
 * The value is enclosed in single quotes, and each single quote inside it is
 * rewritten as `'\''` (leave the quoted section, emit an escaped quote, enter
 * a new quoted section).
 *
 * This matters for data file names that contain an apostrophe, such as a
 * local file named `…Erfgoed+'s-Hertogenbosch.nt`: unescaped, the apostrophe
 * would end the surrounding quotes early, `cat`/`gunzip` would be pointed at a
 * path that does not exist, and `qlever-index` would be fed empty input.
 *
 * @param {string} value - Text to quote, typically a file name.
 * @returns {string} `value` wrapped in single quotes with embedded quotes escaped.
 */
function shellQuote(value) {
    const escaped = value.split("'").join("'\\''");
    return `'${escaped}'`;
}
/**
 * Media types that `qlever-index -F <flag>` can read as-is, keyed by media
 * type with the matching format flag as value. JSON-LD is deliberately not
 * listed: it is converted to N-Quads beforehand (see {@link preprocess}).
 */
const nativeFormats = new Map(
    Object.entries({
        'application/n-triples': 'nt',
        'application/n-quads': 'nq',
        'text/turtle': 'ttl',
    }),
);
/**
 * Accepted distribution media types, in preference order: the first match is
 * tried first. Native formats win over JSON-LD because they skip the Node-side
 * preprocessor.
 *
 * The array position doubles as the sort key when candidate distributions are
 * ordered, so reordering the entries changes which distribution is tried first.
 *
 * `application/zip` is intentionally absent — the inner RDF format must be
 * declared via `mediaType` with `application/zip` appearing only as the
 * `compressFormat`, so we know what is inside.
 */
const acceptedMediaTypes = [
    'application/n-quads',
    'application/n-triples',
    'text/turtle',
    'application/ld+json',
];
160
216
  const defaultQleverIndexOptions = {
161
217
  'ascii-prefixes-only': true,
162
218
  'num-triples-per-batch': 3_000_000,
@@ -169,11 +225,6 @@ const extensionFormats = new Map([
169
225
  ['.nq', 'nq'],
170
226
  ['.ttl', 'ttl'],
171
227
  ]);
172
- const compressionTypes = new Set([
173
- 'application/gzip',
174
- 'application/x-gzip',
175
- 'application/octet-stream',
176
- ]);
177
228
  /**
178
229
  * Determine the QLever format flag for a distribution.
179
230
  *
@@ -183,15 +234,15 @@ const compressionTypes = new Set([
183
234
  * 3. Declared MIME type from the dataset registry (last resort)
184
235
  */
185
236
  function fileFormatFor(declaredMimeType, filename, serverContentType) {
186
- const declaredFormat = supportedFormats.get(declaredMimeType);
237
+ const declaredFormat = nativeFormats.get(declaredMimeType);
187
238
  if (declaredFormat === undefined) {
188
239
  throw new Error(`Unsupported media type: ${declaredMimeType}`);
189
240
  }
190
241
  // Try server Content-Type first (strip parameters like "; charset=utf-8").
191
242
  if (serverContentType) {
192
243
  const actualType = serverContentType.split(';')[0].trim();
193
- if (!compressionTypes.has(actualType)) {
194
- const serverFormat = supportedFormats.get(actualType);
244
+ if (!compressionMediaTypes.has(actualType)) {
245
+ const serverFormat = nativeFormats.get(actualType);
195
246
  if (serverFormat !== undefined && serverFormat !== declaredFormat) {
196
247
  return {
197
248
  format: serverFormat,
@@ -0,0 +1,33 @@
1
+ import { Distribution } from '@lde/dataset';
/** Result of converting a distribution into a file that `qlever-index` can read. */
export interface PreprocessResult {
    /** Path to the file ready for `qlever-index`. Always N-Quads. */
    path: string;
    /** Format of the file at `path`; always `'nq'` (N-Quads). */
    format: 'nq';
    /** Non-fatal messages collected during conversion, e.g. about skipped zip entries. */
    warnings: string[];
}
/**
 * Whether a distribution needs Node-side preprocessing before `qlever-index`
 * can read it.
 *
 * Only JSON-LD distributions return `true`: `qlever-index` cannot parse
 * JSON-LD, so we stream it through a JSON-LD parser into N-Quads first.
 *
 * Native RDF formats (`nt`, `nq`, `ttl`) — including when wrapped in
 * `application/gzip` or `application/zip` — go straight through the shell
 * pipeline in `index()`, which uses `gunzip -c` or `unzip -p` as appropriate.
 * Standalone `mediaType=application/zip` is rejected upstream: the inner
 * format must be declared.
 *
 * @param distribution - Distribution whose declared media type is inspected.
 * @returns `true` when the declared media type is `application/ld+json`.
 */
export declare function needsPreprocessing(distribution: Distribution): boolean;
/**
 * Convert a JSON-LD distribution to N-Quads alongside the source file.
 *
 * Streams the source through `rdf-parse` → `rdf-serialize` so memory use
 * stays bounded regardless of input size. Handles gzip transparently
 * (declared `compressFormat` or `.gz` filename) and zip containers (folds
 * each JSON-LD entry into the output stream in order).
 *
 * Cached: if the output is newer than the input, it is reused as-is.
 *
 * @param localFile - Path of the downloaded distribution; the output is
 *   written next to it as `<localFile>.preprocessed.nq`.
 * @param distribution - Distribution describing `localFile`; must be one for
 *   which {@link needsPreprocessing} returns `true`.
 * @returns The path of the N-Quads file, its format and any warnings.
 * @throws If the distribution does not need preprocessing.
 */
export declare function preprocess(localFile: string, distribution: Distribution): Promise<PreprocessResult>;
33
+ //# sourceMappingURL=preprocess.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"preprocess.d.ts","sourceRoot":"","sources":["../src/preprocess.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAmB5C,MAAM,WAAW,gBAAgB;IAC/B,iEAAiE;IACjE,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,IAAI,CAAC;IACb,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,kBAAkB,CAAC,YAAY,EAAE,YAAY,GAAG,OAAO,CAEtE;AAED;;;;;;;;;GASG;AACH,wBAAsB,UAAU,CAC9B,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,YAAY,GACzB,OAAO,CAAC,gBAAgB,CAAC,CAsB3B"}
@@ -0,0 +1,158 @@
1
+ import { rdfParser } from 'rdf-parse';
2
+ import { rdfSerializer } from 'rdf-serialize';
3
+ import { createGunzip } from 'node:zlib';
4
+ import { createReadStream, createWriteStream } from 'node:fs';
5
+ import { rm, stat } from 'node:fs/promises';
6
+ import { extname } from 'node:path';
7
+ import { finished } from 'node:stream/promises';
8
+ import { promisify } from 'node:util';
9
+ import yauzl from 'yauzl';
/** Media type that marks a distribution as JSON-LD. */
const JSONLD_MIME = 'application/ld+json';
/** `compressMimeType` value that selects the zip-container conversion path. */
const ZIP_MIME = 'application/zip';
/** `compressMimeType` values that are treated as gzip: the current name… */
const GZIP_MIME = 'application/gzip';
/** …and its legacy `x-` spelling. */
const GZIP_MIME_LEGACY = 'application/x-gzip';
/** Lower-case file extensions of zip entries that are parsed as JSON-LD. */
const JSONLD_ZIP_EXTENSIONS = ['.jsonld', '.json'];
/**
 * Tell whether a distribution has to be converted in Node before
 * `qlever-index` can consume it.
 *
 * The answer is `true` for JSON-LD only: `qlever-index` has no JSON-LD parser,
 * so such input is streamed through a JSON-LD parser into N-Quads first.
 *
 * Native RDF formats (`nt`, `nq`, `ttl`) are left alone, also when they are
 * wrapped in `application/gzip` or `application/zip`; the shell pipeline in
 * `index()` unpacks those with `gunzip -c` or `unzip -p`. A distribution whose
 * `mediaType` itself is `application/zip` is rejected upstream, because the
 * inner format must be declared.
 *
 * @param {object} distribution - Distribution whose `mimeType` is inspected.
 * @returns {boolean} `true` when `distribution.mimeType` is the JSON-LD media type.
 */
export function needsPreprocessing(distribution) {
    const { mimeType } = distribution;
    return mimeType === JSONLD_MIME;
}
/**
 * Convert a JSON-LD distribution to N-Quads alongside the source file.
 *
 * Streams the source through `rdf-parse` → `rdf-serialize` so memory use
 * stays bounded regardless of input size. Handles gzip transparently
 * (declared `compressFormat` or `.gz` filename) and zip containers (folds
 * each JSON-LD entry into the output stream in order).
 *
 * Cached: a non-empty output that is newer than the input is reused as-is.
 * To keep that cache trustworthy, the output file is removed again when the
 * conversion fails, so a truncated file can never be mistaken for a finished
 * one on the next run.
 *
 * @param {string} localFile - Path of the downloaded distribution; the output
 *   is written to `<localFile>.preprocessed.nq`.
 * @param {object} distribution - Distribution describing `localFile`.
 * @returns {Promise<{path: string, format: 'nq', warnings: string[]}>} Path
 *   and format of the N-Quads file plus any non-fatal conversion warnings
 *   (always empty when the cached output is reused).
 * @throws {Error} If the distribution does not need preprocessing, or if the
 *   conversion itself fails.
 */
export async function preprocess(localFile, distribution) {
    if (!needsPreprocessing(distribution)) {
        throw new Error(`preprocess called for distribution that does not need preprocessing: mediaType=${distribution.mimeType}`);
    }
    const outputFile = `${localFile}.preprocessed.nq`;
    if (await outputIsUpToDate(localFile, outputFile)) {
        return { path: outputFile, format: 'nq', warnings: [] };
    }
    // Start from a clean slate: a stale output must not be appended to.
    await rm(outputFile, { force: true });
    const warnings = [];
    try {
        if (distribution.compressMimeType === ZIP_MIME) {
            await streamJsonldZip(localFile, outputFile, warnings);
        }
        else {
            await streamJsonldFile(localFile, outputFile, distribution);
        }
    }
    catch (error) {
        // A failed run leaves a partial, non-empty file that is newer than the
        // input — exactly what the cache check accepts. Remove it. The cleanup
        // is best-effort so the conversion error stays the one that is reported.
        await rm(outputFile, { force: true }).catch(() => undefined);
        throw error;
    }
    return { path: outputFile, format: 'nq', warnings };
}
/**
 * Decide whether a previously written output file can be reused.
 *
 * @param {string} inputFile - Path of the source file.
 * @param {string} outputFile - Path of the derived file.
 * @returns {Promise<boolean>} `true` only when both files can be stat'ed, the
 *   output is non-empty and its modification time is strictly later than the
 *   input's; `false` otherwise, including when either `stat` call fails.
 */
async function outputIsUpToDate(inputFile, outputFile) {
    let source;
    let derived;
    try {
        [source, derived] = await Promise.all([inputFile, outputFile].map((file) => stat(file)));
    }
    catch {
        // A file that cannot be stat'ed (e.g. it does not exist) means: no usable cache.
        return false;
    }
    if (!(derived.size > 0)) {
        return false;
    }
    return derived.mtimeMs > source.mtimeMs;
}
/**
 * Feed one JSON-LD source through the parser and the N-Quads serializer into
 * a writable that is already open.
 *
 * The writable is left open (`end: false`) so several sources can be appended
 * to it one after another; ending and closing it is the caller's job.
 * Back-pressure is left to Node's built-in `.pipe()`.
 *
 * @param input - Readable stream of JSON-LD bytes.
 * @param output - Writable that receives the serialized N-Quads.
 * @returns {Promise<void>} Settles once the serialized stream has finished.
 */
async function pipeJsonldToWritable(input, output) {
    const nquads = rdfSerializer.serialize(
        rdfParser.parse(input, { contentType: JSONLD_MIME }),
        { contentType: 'application/n-quads' },
    );
    nquads.pipe(output, { end: false });
    await finished(nquads);
}
/**
 * End a writable stream and wait until it has been closed.
 *
 * A stream emits `'close'` only once, so waiting for that event on a stream
 * that is already closed would never settle. Such a stream is therefore
 * handled up front: the call returns immediately, or throws the error the
 * stream failed with. (`closed`/`errored` are absent on older Node versions;
 * there the function simply waits for the events.)
 *
 * @param output - Writable stream to end.
 * @returns {Promise<void>} Resolves once the stream has emitted `'close'`.
 * @throws The stream's error, when it fails while closing or had already
 *   failed before the call.
 */
async function closeWritable(output) {
    if (output.closed) {
        if (output.errored) {
            throw output.errored;
        }
        return;
    }
    await new Promise((resolve, reject) => {
        // Detach the sibling listener on settle so no handler is left behind.
        const onClose = () => {
            output.removeListener('error', onError);
            resolve();
        };
        const onError = (error) => {
            output.removeListener('close', onClose);
            reject(error);
        };
        output.once('close', onClose);
        output.once('error', onError);
        output.end();
    });
}
/**
 * Convert a single (optionally gzipped) JSON-LD file to an N-Quads file.
 *
 * The input counts as gzipped when the distribution declares a gzip
 * `compressMimeType` or when the local file name ends in `.gz`.
 *
 * @param {string} localFile - Path of the JSON-LD file to read.
 * @param {string} outputFile - Path of the N-Quads file to create.
 * @param {object} distribution - Distribution describing `localFile`; only its
 *   `compressMimeType` is read.
 * @returns {Promise<void>} Settles after the output stream has been closed.
 */
async function streamJsonldFile(localFile, outputFile, distribution) {
    const isGzipped = distribution.compressMimeType === GZIP_MIME ||
        distribution.compressMimeType === GZIP_MIME_LEGACY ||
        localFile.toLowerCase().endsWith('.gz');
    const source = createReadStream(localFile);
    let input = source;
    if (isGzipped) {
        const gunzip = createGunzip();
        // `.pipe()` does not forward errors from its source. Without this
        // listener a read error on the file would be an unhandled 'error'
        // event; forwarding it makes the decompressed stream fail instead.
        source.once('error', (error) => gunzip.destroy(error));
        input = source.pipe(gunzip);
    }
    const output = createWriteStream(outputFile);
    try {
        await pipeJsonldToWritable(input, output);
    }
    finally {
        // Close the output on success and on failure alike.
        await closeWritable(output);
    }
}
// Promise-returning wrapper around yauzl's callback-style `open`.
const openZip = promisify(yauzl.open);
/**
 * Convert the JSON-LD entries of a zip archive to N-Quads, appending each
 * entry's output to one shared output file in the order the entries arrive.
 *
 * Entries whose name ends in `/` are skipped silently. Entries whose extension
 * is not listed in `JSONLD_ZIP_EXTENSIONS` are skipped with a warning.
 *
 * @param {string} zipFile - Path of the zip archive to read.
 * @param {string} outputFile - Path of the N-Quads file to create.
 * @param {string[]} warnings - Caller-owned array; one message is appended for
 *   every skipped non-JSON-LD entry.
 * @returns {Promise<void>} Settles after the archive and the output are closed.
 * @throws {Error} If opening or converting an entry fails, if the archive
 *   emits an error, or if no JSON-LD entry was processed.
 */
async function streamJsonldZip(zipFile, outputFile, warnings) {
    // NOTE(review): with `lazyEntries`, yauzl presumably emits the next 'entry'
    // only after `readEntry()` is called — the code below relies on that to
    // process entries strictly one at a time; confirm against the yauzl docs.
    const zip = await openZip(zipFile, { lazyEntries: true });
    const output = createWriteStream(outputFile);
    let entriesProcessed = 0;
    try {
        await new Promise((resolve, reject) => {
            zip.on('error', reject);
            zip.on('end', resolve);
            zip.on('entry', (entry) => {
                // The handler itself cannot be awaited by the emitter, so the
                // async work runs in a detached function whose failures are
                // routed to `reject` instead of being lost.
                void (async () => {
                    try {
                        // Names ending in '/' are skipped (directory entries).
                        if (entry.fileName.endsWith('/')) {
                            zip.readEntry();
                            return;
                        }
                        const extension = extname(entry.fileName).toLowerCase();
                        if (!JSONLD_ZIP_EXTENSIONS.includes(extension)) {
                            warnings.push(`Skipping zip entry ${entry.fileName}: extension ${extension || '(none)'} is not JSON-LD`);
                            zip.readEntry();
                            return;
                        }
                        const stream = await openZipEntry(zip, entry);
                        // Wait for this entry to be fully written before the
                        // next one is requested, so outputs never interleave.
                        await pipeJsonldToWritable(stream, output);
                        entriesProcessed++;
                        zip.readEntry();
                    }
                    catch (error) {
                        // No further `readEntry()` call: iteration stops here.
                        reject(error);
                    }
                })();
            });
            // Request the first entry.
            zip.readEntry();
        });
    }
    finally {
        zip.close();
        await closeWritable(output);
    }
    if (entriesProcessed === 0) {
        throw new Error(`Zip ${zipFile} contains no JSON-LD entries`);
    }
}
/**
 * Promise wrapper around `zip.openReadStream(entry, callback)`.
 *
 * @param zip - Open zip archive exposing a callback-style `openReadStream`.
 * @param entry - The entry to open.
 * @returns {Promise<*>} Resolves with the stream handed to the callback.
 *   Rejects with the callback's error, or with a generic error when the
 *   callback reports neither an error nor a stream.
 */
function openZipEntry(zip, entry) {
    return new Promise((resolve, reject) => {
        const onOpened = (error, stream) => {
            if (!error && stream !== undefined) {
                resolve(stream);
                return;
            }
            reject(error ?? new Error('Failed to open zip entry'));
        };
        zip.openReadStream(entry, onOpened);
    });
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lde/sparql-qlever",
3
- "version": "0.14.3",
3
+ "version": "0.14.5",
4
4
  "repository": {
5
5
  "url": "git+https://github.com/ldelements/lde.git",
6
6
  "directory": "packages/sparql-qlever"
@@ -24,14 +24,21 @@
24
24
  "!**/*.tsbuildinfo"
25
25
  ],
26
26
  "dependencies": {
27
- "@lde/dataset": "0.7.3",
28
- "@lde/distribution-downloader": "0.6.1",
29
- "@lde/sparql-importer": "0.6.1",
27
+ "@lde/dataset": "0.7.4",
28
+ "@lde/distribution-downloader": "0.6.2",
29
+ "@lde/sparql-importer": "0.6.2",
30
30
  "@lde/sparql-server": "0.4.11",
31
31
  "@lde/task-runner": "0.2.11",
32
32
  "@lde/task-runner-docker": "0.2.13",
33
33
  "@lde/task-runner-native": "0.2.14",
34
34
  "@lde/wait-for-sparql": "0.2.12",
35
- "tslib": "^2.3.0"
35
+ "rdf-parse": "^5.0.0",
36
+ "rdf-serialize": "^5.1.0",
37
+ "tslib": "^2.3.0",
38
+ "yauzl": "^3.3.1"
39
+ },
40
+ "devDependencies": {
41
+ "@rdfjs/types": "^2.0.0",
42
+ "@types/yauzl": "^2.10.3"
36
43
  }
37
44
  }