marcattacks 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
(
  /******start helper functions****/

  /* Remove one trailing comma, period, colon or slash (and any
     whitespace before it) — typical trailing MARC punctuation. */
  $strip := function ($str) {
      $replace($str,/\s*[\,.:\/]$/,"")
  };

  /* First value of $marcmap for a path (paths can repeat per record). */
  $marcmap0 := function ($path) {
      $marcmap($path)[0]
  };

  /* Material-type code (taken from field 920$a below) -> schema.org class.
     The "_" entry is the fallback used by $typeLookup. */
  $typeLookupTable := {
      "_": "schema:CreativeWork", /* default */
      "book": "schema:Book",
      "catalog": "schema:Book",
      "correspondence": "schema:CreativeWork",
      "dissertation": "schema:Thesis",
      "ephemera": "schema:CreativeWork",
      "image": "schema:VisualWork",
      "manuscript": "schema:Book",
      "map": "schema:Map",
      "master": "schema:Thesis",
      "periodical": "schema:Periodical",
      "phd": "schema:Thesis"
  };

  /* Look up a type code, falling back to the "_" default entry. */
  $typeLookup := function ($val) {(
      $v := $lookup($typeLookupTable,$val);
      $v ? $v : $typeLookupTable._
  )};
  /******end helper functions******/

  /* One output object per record: id from the 001 control number,
     type from 920$a, title from 245$a/$b joined and de-punctuated. */
  {
    "@id": "my:" & $marcmap0("001"),
    "@type": $typeLookup($marcmap0("920a")),
    "name": $marcmap("245ab") ~> $join(" ") ~> $strip() ,
    /* NOTE(review): 500$a is the MARC "General Note" field; mapping it
       to subjects looks intentional for this local data — confirm. */
    "subject": $marcmap("500a") ~> $map(function ($val) {
      {
        "@id": $genid(),
        "@type": "schema:Subject",
        "name": $val
      }
    })
  }
)
package/dist/index.js ADDED
@@ -0,0 +1,150 @@
1
+ #!/usr/bin/env node
2
+ import log4js from 'log4js';
3
+ import { program } from 'commander';
4
+ import { loadPlugin } from './plugin-loader.js';
5
+ import { sftpReadStream, sftpWriteStream, sftpLatestFile } from './sftpstream.js';
6
+ import { httpReadStream } from './httpstream.js';
7
+ import { Readable } from 'stream';
8
+ import { pathToFileURL } from "node:url";
9
+ import { SlowWritable } from './slow-writable.js';
10
+ import path from "node:path";
11
+ import fs from 'fs';
12
+ import { s3ReaderStream, s3WriterStream } from './s3stream.js';
13
+ log4js.configure({
14
+ appenders: {
15
+ err: {
16
+ type: "stderr",
17
+ layout: {
18
+ type: "pattern",
19
+ pattern: "%[%d %p %f{1} %m%]"
20
+ }
21
+ }
22
+ },
23
+ categories: {
24
+ default: { appenders: ["err"], level: "off", enableCallStack: true }
25
+ }
26
+ });
27
+ program.version('0.1.0')
28
+ .argument('<file>')
29
+ .option('-f,--from <from>', 'input type', 'xml')
30
+ .option('-t,--to <output>', 'output type', 'json')
31
+ .option('-m,--map <map>', 'data mapper', 'json')
32
+ .option('--fix <what>', 'jsonata')
33
+ .option('-o,--out <file>', 'output file')
34
+ .option('--key <keyfile>', 'private key file')
35
+ .option('--info', 'output debugging messages')
36
+ .option('--debug', 'output more debugging messages')
37
+ .option('--trace', 'output much more debugging messages');
38
+ program.parse(process.argv);
39
+ const opts = program.opts();
40
+ const logger = log4js.getLogger();
41
+ if (opts.info) {
42
+ logger.level = "info";
43
+ }
44
+ if (opts.debug) {
45
+ logger.level = "debug";
46
+ }
47
+ if (opts.trace) {
48
+ logger.level = "trace";
49
+ }
50
+ main();
51
+ async function main() {
52
+ const url = program.args[0];
53
+ if (!url) {
54
+ console.error(`need an input file`);
55
+ process.exit(2);
56
+ }
57
+ let inputFile;
58
+ if (fs.existsSync(url)) {
59
+ const filePath = path.resolve(process.cwd(), url);
60
+ inputFile = pathToFileURL(filePath);
61
+ }
62
+ else {
63
+ inputFile = new URL(url);
64
+ }
65
+ logger.info(`using: ${inputFile}`);
66
+ let readableStream;
67
+ if (inputFile.protocol.startsWith("http")) {
68
+ readableStream = await httpReadStream(inputFile.toString());
69
+ }
70
+ else if (inputFile.protocol.startsWith("s3")) {
71
+ readableStream = await s3ReaderStream(inputFile, {});
72
+ }
73
+ else if (inputFile.protocol === 'sftp:') {
74
+ const config = makeSftpConfig(inputFile, opts);
75
+ let remotePath;
76
+ if (inputFile.pathname.match(/\/@latest:\w+$/)) {
77
+ const remoteDir = inputFile.pathname.replace(/\/@latest.*/, "");
78
+ const extension = inputFile.pathname.replace(/.*\/@latest:/, "");
79
+ remotePath = await sftpLatestFile(config, remoteDir, extension);
80
+ }
81
+ else {
82
+ remotePath = inputFile.pathname;
83
+ }
84
+ readableStream = await sftpReadStream(remotePath, config);
85
+ }
86
+ else {
87
+ readableStream = fs.createReadStream(inputFile);
88
+ }
89
+ let objectStream;
90
+ if (opts.from) {
91
+ const mod = await loadPlugin(opts.from, 'input');
92
+ objectStream = mod.stream2readable(readableStream);
93
+ }
94
+ else {
95
+ console.error(`Need --from`);
96
+ process.exit(1);
97
+ }
98
+ let resultStream = objectStream;
99
+ if (opts.map) {
100
+ const mod = await loadPlugin(opts.map, 'transform');
101
+ const transformer = await mod.transform(opts.fix);
102
+ resultStream = objectStream.pipe(transformer);
103
+ }
104
+ let outStream;
105
+ if (opts.out === '@slow') {
106
+ outStream = new SlowWritable({ delayMs: 100 });
107
+ }
108
+ else if (opts.out) {
109
+ if (opts.out.startsWith("sftp")) {
110
+ const url = new URL(opts.out);
111
+ const config = makeSftpConfig(url, opts);
112
+ logger.info(`put ${url}`);
113
+ outStream = await sftpWriteStream(url.href, config);
114
+ }
115
+ else if (opts.out.startsWith("s3")) {
116
+ const url = new URL(opts.out);
117
+ logger.info(`put ${url}`);
118
+ outStream = await s3WriterStream(url, {});
119
+ }
120
+ else {
121
+ outStream = fs.createWriteStream(opts.out, { encoding: 'utf-8' });
122
+ }
123
+ }
124
+ else {
125
+ outStream = process.stdout;
126
+ }
127
+ if (opts.to) {
128
+ const mod = await loadPlugin(opts.to, 'output');
129
+ mod.readable2writable(resultStream, outStream);
130
+ }
131
+ }
132
/**
 * Build an ssh2-sftp-client connection config from an sftp:// URL.
 *
 * Host, port and username come from the URL; a password is taken from
 * the URL when present, and a private key is read from the --key file
 * when given. Both credentials are optional and only set when present.
 */
function makeSftpConfig(inputFile, opts) {
    let privateKey = undefined;
    if (opts.key) {
        privateKey = fs.readFileSync(opts.key, { encoding: 'utf-8' });
    }
    // BUG FIX: a URL without an explicit port has port === "", and
    // Number("") is 0 — not nullish — so `Number(port) ?? 22` always
    // yielded 0. Default to the standard SSH/SFTP port 22 instead.
    let config = {
        host: inputFile.hostname,
        port: inputFile.port ? Number(inputFile.port) : 22,
        username: inputFile.username
    };
    if (inputFile.password) {
        config.password = inputFile.password;
    }
    if (privateKey) {
        config.privateKey = privateKey;
    }
    return config;
}
150
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,37 @@
1
# Local S3-compatible test stack: a MinIO server plus a one-shot mc
# client container that creates the bucket the tooling expects.
services:
  minio:
    image: minio/minio:RELEASE.2025-04-22T22-12-26Z-cpuv1
    # 3371 = S3 API, 3372 = web console (non-default ports, so a
    # locally installed MinIO on 9000/9001 does not clash).
    command: server /data -address ":3371" --console-address ":3372"
    environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    volumes:
      - minio-data:/data
    ports:
      - 3371:3371
      - 3372:3372
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3371/minio/health/live"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10s

  # Bootstrap job: registers the server alias and creates the "bbl"
  # bucket (idempotent via --ignore-existing), retrying on failure.
  mc:
    image: minio/mc
    depends_on:
      - minio
    # minio:
    #   condition: service_healthy
    restart: on-failure
    entrypoint:
      - sh
      - -c
      - |
        sleep 2;
        mc alias set docker http://minio:3371 minioadmin minioadmin;
        mc mb docker/bbl --ignore-existing;

volumes:
  minio-data:
    driver: local
package/logo.jpg ADDED
Binary file
package/package.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "name": "marcattacks",
3
+ "version": "1.0.0",
4
+ "main": "index.js",
5
+ "type": "module",
6
+ "author": "",
7
+ "repository": "https://codeberg.org/phochste/marcattacks.git",
8
+ "bin": {
9
+ "marcattacks": "./dist/index.js"
10
+ },
11
+ "scripts": {
12
+ "test": "echo \"Error: no test specified\" && exit 1",
13
+ "build:ts": "npx tsc",
14
+ "build:watch": "npx tsc -w",
15
+ "docker:build": "docker build . -t hochstenbach/marcattacks:v0.0.1",
16
+ "docker:run": "docker run --rm -v `pwd`/data:/app/data -it hochstenbach/marcattacks:v0.0.1 --to rdf --map rdf data/sample.xml",
17
+ "docker:push": "docker push hochstenbach/marcattacks:v0.0.1"
18
+ },
19
+ "keywords": [],
20
+ "license": "MIT",
21
+ "description": "",
22
+ "devDependencies": {
23
+ "@types/n3": "^1.26.1",
24
+ "@types/node": "^24.10.1",
25
+ "@types/sax": "^1.2.7",
26
+ "@types/ssh2-sftp-client": "^9.0.5",
27
+ "@types/stream-json": "^1.7.8",
28
+ "eslint": "^9.39.1",
29
+ "prettier": "^3.6.2",
30
+ "ts-node": "^10.9.2",
31
+ "typescript": "^5.9.3"
32
+ },
33
+ "dependencies": {
34
+ "@aws-sdk/client-s3": "^3.940.0",
35
+ "commander": "^14.0.2",
36
+ "fast-xml-parser": "^5.3.2",
37
+ "jsonata": "^2.1.0",
38
+ "log4js": "^6.9.1",
39
+ "n3": "^1.26.0",
40
+ "sax": "^1.4.3",
41
+ "ssh2-sftp-client": "^12.0.1",
42
+ "stream-chain": "^3.4.0",
43
+ "stream-json": "^1.9.1",
44
+ "uuid": "^13.0.0"
45
+ }
46
+ }
package/plugin/demo.js ADDED
@@ -0,0 +1,12 @@
1
+ import { Transform } from 'stream';
2
+
3
/**
 * Demo transform plugin: stamps every passing object with a fixed
 * id ("brol") and resets its record list to [], then forwards it.
 * The opts argument is part of the plugin API but is not used here.
 */
export function transform(opts) {
    const stamp = (chunk, _encoding, done) => {
        chunk.id = "brol";
        chunk.record = [];
        done(null, chunk);
    };
    return new Transform({ objectMode: true, transform: stamp });
}
@@ -0,0 +1,28 @@
1
+ import { Readable } from 'stream';
2
+ import * as http from 'http';
3
+ import * as https from 'https';
4
+ import { URL } from 'url';
5
+
6
+ export function httpReadStream(urlString: string): Promise<Readable> {
7
+ return new Promise((resolve, reject) => {
8
+ const url = new URL(urlString);
9
+ const client = url.protocol === 'http:' ? http : https;
10
+
11
+ const req = client.get(url, res => {
12
+ if (res.statusCode && res.statusCode >= 400) {
13
+ reject(new Error('HTTP ' + res.statusCode));
14
+ return;
15
+ }
16
+
17
+ // Follow redirects
18
+ if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
19
+ httpReadStream(res.headers.location).then(resolve).catch(reject);
20
+ return;
21
+ }
22
+
23
+ resolve(res);
24
+ });
25
+
26
+ req.on('error', reject);
27
+ });
28
+ }
package/src/index.ts ADDED
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env node
2
+
3
+ import log4js from 'log4js';
4
+ import { program } from 'commander';
5
+ import { loadPlugin } from './plugin-loader.js';
6
+ import { sftpReadStream , sftpWriteStream , sftpLatestFile , type SftpConfig } from './sftpstream.js';
7
+ import { httpReadStream } from './httpstream.js';
8
+ import { Readable } from 'stream';
9
+ import { pathToFileURL } from "node:url";
10
+ import type { Transform, Writable } from 'node:stream';
11
+ import { SlowWritable } from './slow-writable.js';
12
+ import path from "node:path";
13
+ import fs from 'fs';
14
+ import { s3ReaderStream, s3WriterStream } from './s3stream.js';
15
+
16
// Route all logging to stderr so the converted data on stdout stays
// clean; logging is "off" by default and raised by the flags below.
log4js.configure({
    appenders: {
        err: {
            type: "stderr" ,
            layout: {
                type: "pattern",
                pattern: "%[%d %p %f{1} %m%]"
            }
        }
    },
    categories: {
        default: { appenders: ["err"], level: "off" , enableCallStack: true }
    }
});

// CLI: one positional input (file path or URL) plus conversion options.
// --from/--to/--map name plugins resolved at runtime by loadPlugin().
program.version('0.1.0')
    .argument('<file>')
    .option('-f,--from <from>','input type','xml')
    .option('-t,--to <output>','output type','json')
    .option('-m,--map <map>','data mapper','json')
    .option('--fix <what>','jsonata')
    .option('-o,--out <file>','output file')
    .option('--key <keyfile>', 'private key file')
    .option('--info','output debugging messages')
    .option('--debug','output more debugging messages')
    .option('--trace','output much more debugging messages');

program.parse(process.argv);

const opts = program.opts();
const logger = log4js.getLogger();

// Checked in increasing-verbosity order, so the most verbose flag
// supplied wins (--trace overrides --debug overrides --info).
if (opts.info) {
    logger.level = "info";
}

if (opts.debug) {
    logger.level = "debug";
}

if (opts.trace) {
    logger.level = "trace";
}

// Fire-and-forget entry point; rejections surface as unhandled-rejection.
main();
61
+
62
/**
 * CLI pipeline: resolve the input argument to a URL, open a byte
 * stream for it (file / http(s) / s3 / sftp), parse it into an
 * object stream via the --from input plugin, optionally transform it
 * via the --map plugin (passing --fix through), and serialize it to
 * the --out destination via the --to output plugin.
 */
async function main() : Promise<void> {
    const url = program.args[0];

    if (! url) {
        console.error(`need an input file`);
        process.exit(2);
    }

    let inputFile : URL;

    // An existing local path wins over URL parsing; anything else must
    // parse as an absolute URL (new URL throws otherwise).
    if (fs.existsSync(url)) {
        const filePath = path.resolve(process.cwd(), url);
        inputFile = pathToFileURL(filePath);
    }
    else {
        inputFile = new URL(url);
    }

    logger.info(`using: ${inputFile}`);

    let readableStream;

    if (inputFile.protocol.startsWith("http")) {
        readableStream = await httpReadStream(inputFile.toString());
    }
    else if (inputFile.protocol.startsWith("s3")) {
        readableStream = await s3ReaderStream(inputFile,{});
    }
    else if (inputFile.protocol === 'sftp:') {
        const config = makeSftpConfig(inputFile,opts);

        let remotePath;

        // sftp://host/dir/@latest:ext selects the newest matching file
        // in dir via sftpLatestFile.
        if (inputFile.pathname.match(/\/@latest:\w+$/)) {
            const remoteDir = inputFile.pathname.replace(/\/@latest.*/,"");
            const extension = inputFile.pathname.replace(/.*\/@latest:/,"");
            remotePath = await sftpLatestFile(config,remoteDir,extension);
        }
        else {
            remotePath = inputFile.pathname;
        }

        readableStream = await sftpReadStream(remotePath, config);
    }
    else {
        // Fallback covers the file: URLs produced above.
        readableStream = fs.createReadStream(inputFile);
    }

    let objectStream : Readable;

    if (opts.from) {
        const mod = await loadPlugin(opts.from,'input');
        objectStream = mod.stream2readable(readableStream);
    }
    else {
        // NOTE(review): --from has a default ('xml'), so this branch
        // looks unreachable via commander — confirm before relying on it.
        console.error(`Need --from`);
        process.exit(1);
    }

    let resultStream = objectStream;

    if (opts.map) {
        const mod = await loadPlugin(opts.map,'transform');
        const transformer : Transform = await mod.transform(opts.fix);
        resultStream = objectStream.pipe(transformer);
    }

    let outStream : Writable;

    // '@slow' is a debugging sink that delays each write (SlowWritable),
    // useful for exercising the input plugins' backpressure handling.
    if (opts.out === '@slow') {
        outStream = new SlowWritable({ delayMs: 100 });
    }
    else if (opts.out) {
        if (opts.out.startsWith("sftp")) {
            const url = new URL(opts.out);
            const config = makeSftpConfig(url,opts);
            logger.info(`put ${url}`);
            outStream = await sftpWriteStream(url.href, config);
        }
        else if (opts.out.startsWith("s3")) {
            const url = new URL(opts.out);
            logger.info(`put ${url}`);
            outStream = await s3WriterStream(url,{});
        }
        else {
            outStream = fs.createWriteStream(opts.out, { encoding: 'utf-8'});
        }
    }
    else {
        outStream = process.stdout;
    }

    if (opts.to) {
        const mod = await loadPlugin(opts.to,'output');
        mod.readable2writable(resultStream, outStream);
    }
}
159
+
160
+ function makeSftpConfig(inputFile: URL, opts: any) : SftpConfig {
161
+ let privateKey : string | undefined = undefined;
162
+
163
+ if (opts.key) {
164
+ privateKey = fs.readFileSync(opts.key,{ encoding: 'utf-8'});
165
+ }
166
+
167
+ let config: SftpConfig = {
168
+ host: inputFile.hostname,
169
+ port: Number(inputFile.port) ?? 22,
170
+ username: inputFile.username
171
+ };
172
+
173
+ if (inputFile.password) { config.password = inputFile.password }
174
+ if (privateKey) { config.privateKey = privateKey}
175
+
176
+ return config;
177
+ }
@@ -0,0 +1,83 @@
1
+ import { Readable } from "stream";
2
+ import * as readline from 'node:readline'
3
+ import log4js from 'log4js';
4
+
5
+ const logger = log4js.getLogger();
6
+
7
+ export function stream2readable(stream: Readable) : Readable {
8
+ let recordNum = 0;
9
+
10
+ const rl = readline.createInterface({input: stream, crlfDelay: Infinity});
11
+
12
+ let sourcePaused = false;
13
+
14
+ const readableStream = new Readable({
15
+ read() {
16
+ if (sourcePaused) {
17
+ logger.debug("backpressure off");
18
+ rl.resume();
19
+ sourcePaused = false;
20
+ }
21
+ } ,
22
+ objectMode: true
23
+ });
24
+
25
+ let rec : string[][] = [];
26
+ let previd : string = "";
27
+
28
+ rl.on('line', (line) => {
29
+ const [id,...rest] = line.split(" ");
30
+ const data = rest.join(" ");
31
+
32
+ if (previd && previd !== id) {
33
+ const ok = readableStream.push({
34
+ record: rec
35
+ });
36
+
37
+ if (!ok) {
38
+ logger.debug("backpressure on");
39
+ rl.pause();
40
+ sourcePaused = true;
41
+ }
42
+ rec = [];
43
+ recordNum++;
44
+
45
+ if (recordNum % 1000 === 0) {
46
+ logger.info(`record: ${recordNum}`);
47
+ }
48
+ }
49
+
50
+ const tag = data?.substring(0,3);
51
+ const ind1 = data?.substring(3,4);
52
+ const ind2 = data?.substring(4,5);
53
+ const sf = data?.substring(8);
54
+ const parts = sf.split(/\$\$(.)/);
55
+
56
+ if (tag == 'FMT' || tag === 'LDR' || tag.startsWith("00")) {
57
+ rec.push([
58
+ tag,ind1,ind2
59
+ ].concat(["_"].concat(parts)));
60
+ }
61
+ else {
62
+ rec.push([
63
+ tag,ind1,ind2
64
+ ].concat(parts.slice(1)));
65
+ }
66
+
67
+ previd = id!;
68
+ });
69
+
70
+ rl.on('close', () => {
71
+ readableStream.push({
72
+ record: rec
73
+ });
74
+ recordNum++;
75
+ if (recordNum % 1000 === 0) {
76
+ logger.info(`record: ${recordNum}`);
77
+ }
78
+ readableStream.push(null);
79
+ logger.info(`processed ${recordNum} records`);
80
+ });
81
+
82
+ return readableStream;
83
+ }
@@ -0,0 +1,47 @@
1
+ import { Readable } from "stream";
2
+ import streamArray from "stream-json/streamers/StreamArray.js";
3
+ import log4js from 'log4js';
4
+
5
+ const logger = log4js.getLogger();
6
+
7
+ export function stream2readable(stream: Readable) : Readable {
8
+ let recordNum = 0;
9
+
10
+ const pipeline = stream.pipe(streamArray.withParser());
11
+
12
+ let sourcePaused = false;
13
+
14
+ const readableStream = new Readable({
15
+ read() {
16
+ if (sourcePaused) {
17
+ logger.debug("backpressure off");
18
+ pipeline.resume();
19
+ sourcePaused = false;
20
+ }
21
+ } ,
22
+ objectMode: true
23
+ });
24
+
25
+ pipeline.on('data', (data: any) => {
26
+ const ok = readableStream.push(data.value);
27
+
28
+ if (!ok) {
29
+ logger.debug("backpressure on")
30
+ pipeline.pause();
31
+ sourcePaused = true;
32
+ }
33
+
34
+ recordNum++;
35
+
36
+ if (recordNum % 1000 === 0) {
37
+ logger.info(`record: ${recordNum}`);
38
+ }
39
+ });
40
+
41
+ pipeline.on('end', () => {
42
+ logger.info(`processed ${recordNum} records`);
43
+ readableStream.push(null);
44
+ });
45
+
46
+ return readableStream;
47
+ }
@@ -0,0 +1,47 @@
1
+ import { Readable } from "stream";
2
+ import * as readline from 'node:readline'
3
+ import log4js from 'log4js';
4
+
5
+ const logger = log4js.getLogger();
6
+
7
+ export function stream2readable(stream: Readable) : Readable {
8
+ let recordNum = 0;
9
+
10
+ const rl = readline.createInterface({input: stream, crlfDelay: Infinity});
11
+
12
+ let sourcePaused = false;
13
+
14
+ const readableStream = new Readable({
15
+ read() {
16
+ if (sourcePaused) {
17
+ logger.debug("backpressure off");
18
+ rl.resume();
19
+ sourcePaused = false;
20
+ }
21
+ } ,
22
+ objectMode: true
23
+ });
24
+
25
+ rl.on('line', (line) => {
26
+ const ok = readableStream.push(JSON.parse(line));
27
+
28
+ if (!ok) {
29
+ logger.debug("backpressure on");
30
+ rl.pause();
31
+ sourcePaused = true;
32
+ }
33
+
34
+ recordNum++;
35
+
36
+ if (recordNum % 1000 === 0) {
37
+ logger.info(`record: ${recordNum}`);
38
+ }
39
+ });
40
+
41
+ rl.on('close', () => {
42
+ readableStream.push(null);
43
+ logger.info(`processed ${recordNum} records`);
44
+ });
45
+
46
+ return readableStream;
47
+ }