@engine9-io/input-tools 1.9.11 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,45 +1,39 @@
- const { google } = require('googleapis');
- const fs = require('node:fs');
-
+ import { google } from "googleapis";
+ import fs from "node:fs";
  const fsp = fs.promises;
-
- function Worker() {}
-
+ function Worker() { }
  Worker.prototype.setAuth = async function () {
-   const keyFile = process.env.GOOGLE_APPLICATION_CREDENTIALS;
-   const settings = JSON.parse(await fsp.readFile(keyFile));
-   if (!settings.subject_to_impersonate) throw new Error(`You should include subject_to_impersonate in file ${keyFile}`);
-
-   const auth = new google.auth.GoogleAuth({
-     clientOptions: {
-       subject: settings.subject_to_impersonate,
-     },
-     keyFile,
-     scopes: ['https://www.googleapis.com/auth/drive'],
-   });
-   google.options({
-     auth,
-   });
+     const keyFile = process.env.GOOGLE_APPLICATION_CREDENTIALS;
+     const settings = JSON.parse(await fsp.readFile(keyFile));
+     if (!settings.subject_to_impersonate)
+         throw new Error(`You should include subject_to_impersonate in file ${keyFile}`);
+     const auth = new google.auth.GoogleAuth({
+         clientOptions: {
+             subject: settings.subject_to_impersonate,
+         },
+         keyFile,
+         scopes: ['https://www.googleapis.com/auth/drive'],
+     });
+     google.options({
+         auth,
+     });
  };
-
  Worker.prototype.list = async function ({ path }) {
-   await this.setAuth();
-   const drive = google.drive({ version: 'v3' });
-   const folderId = path;
-   const q = `'${folderId}' in parents and trashed=false`;
-   const raw = await drive.files.list({
-     pageSize: 150,
-     q,
-     supportsAllDrives: true, // include share drives as well
-     includeItemsFromAllDrives: true,
-   });
-
-   return raw.data?.files;
+     await this.setAuth();
+     const drive = google.drive({ version: 'v3' });
+     const folderId = path;
+     const q = `'${folderId}' in parents and trashed=false`;
+     const raw = await drive.files.list({
+         pageSize: 150,
+         q,
+         supportsAllDrives: true, // include share drives as well
+         includeItemsFromAllDrives: true,
+     });
+     return raw.data?.files;
  };
  Worker.prototype.list.metadata = {
-   options: {
-     path: {},
-   },
+     options: {
+         path: {},
+     },
  };
-
- module.exports = Worker;
+ export default Worker;
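
The hunk above converts this worker (a Google Drive folder lister built on googleapis) from CommonJS to an ES module with a default export, which is the breaking change behind the 1.9.11 → 2.0.0 major bump. A minimal consumer-side sketch of the before and after, based only on the code shown; the worker's file path is not included in this diff, so the module specifier below is hypothetical:

    // 1.x (CommonJS), hypothetical path
    // const Worker = require('./GoogleDrive');

    // 2.x (ESM), hypothetical path
    import Worker from './GoogleDrive.js';

    const worker = new Worker();
    // setAuth() reads the key file named by GOOGLE_APPLICATION_CREDENTIALS and requires it
    // to contain subject_to_impersonate; list() calls setAuth() itself before querying Drive.
    const files = await worker.list({ path: 'drive-folder-id' }); // folder id is a placeholder
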
package/file/Parquet.js CHANGED
@@ -1,149 +1,137 @@
- const parquet = require('@dsnp/parquetjs');
-
- const { Readable } = require('node:stream');
- const debug = require('debug')('Parquet');
- const { S3Client } = require('@aws-sdk/client-s3');
- const FileWorker = require('./FileUtilities');
-
- function Worker() {}
-
+ import parquet from "@dsnp/parquetjs";
+ import nodestream from "node:stream";
+ import debug$0 from "debug";
+ import clientS3 from "@aws-sdk/client-s3";
+ import FileWorker from "./FileUtilities.js";
+ const { Readable } = nodestream;
+ const debug = debug$0('Parquet');
+ const { S3Client } = clientS3;
+ function Worker() { }
  async function getReader(options) {
-   const { filename } = options;
-   if (!filename) throw new Error('filename is required');
-   if (filename.indexOf('s3://') === 0) {
-     const client = new S3Client({});
-     const parts = filename.split('/');
-
-     return parquet.ParquetReader.openS3(client, {
-       Bucket: parts[2],
-       Key: parts.slice(3).join('/')
-     });
-   }
-   return parquet.ParquetReader.openFile(filename);
+     const { filename } = options;
+     if (!filename)
+         throw new Error('filename is required');
+     if (filename.indexOf('s3://') === 0) {
+         const client = new S3Client({});
+         const parts = filename.split('/');
+         return parquet.ParquetReader.openS3(client, {
+             Bucket: parts[2],
+             Key: parts.slice(3).join('/')
+         });
+     }
+     return parquet.ParquetReader.openFile(filename);
  }
-
  Worker.prototype.meta = async function (options) {
-   const reader = await getReader(options);
-   const schema = reader.getSchema();
-   return {
-     //stored as a buffer
-     schema,
-     records: parseInt(reader.metadata?.num_rows?.toString(), 10)
-   };
-   // getMetadata();
+     const reader = await getReader(options);
+     const schema = reader.getSchema();
+     return {
+         //stored as a buffer
+         schema,
+         records: parseInt(reader.metadata?.num_rows?.toString(), 10)
+     };
+     // getMetadata();
  };
  Worker.prototype.meta.metadata = {
-   options: {
-     path: {}
-   }
+     options: {
+         path: {}
+     }
  };
  Worker.prototype.schema = async function (options) {
-   const reader = await getReader(options);
-   return reader.getSchema();
+     const reader = await getReader(options);
+     return reader.getSchema();
  };
  Worker.prototype.schema.metadata = {
-   options: {
-     path: {}
-   }
+     options: {
+         path: {}
+     }
  };
-
  function cleanColumnName(name) {
-   return name.toLowerCase().replace(/[^a-z0-9_]/g, '_');
+     return name.toLowerCase().replace(/[^a-z0-9_]/g, '_');
  }
-
  Worker.prototype.stream = async function (options) {
-   const reader = await getReader(options);
-   let columns;
-   if (options.columns) {
-     const { fieldList } = await this.schema(options);
-     columns = [];
-     let requestedColumns = options.columns;
-     if (typeof options.columns === 'string') requestedColumns = options.columns.split(',').map((d) => d.trim());
-     else requestedColumns = options.columns.map((d) => (d.name ? d.name.trim() : d.trim()));
-     requestedColumns.forEach((c) => {
-       const matchingCols = fieldList
-         .filter((f) => f.name === c || cleanColumnName(f.name) === cleanColumnName(c))
-         .map((f) => f.name);
-       columns = columns.concat(matchingCols);
-     });
-   }
-   let limit = 0;
-   if (parseInt(options.limit, 10) === options.limit) limit = parseInt(options.limit, 10);
-   // create a new cursor
-   debug(`Reading parquet file ${options.filename} with columns ${columns?.join(',')} and limit ${limit}`);
-   const cursor = reader.getCursor(columns);
-
-   let counter = 0;
-
-   const start = new Date().getTime();
-
-   const stream = new Readable({
-     objectMode: true,
-     async read() {
-       const token = await cursor.next();
-       if (token) {
-         counter += 1;
-         if (limit && counter > limit) {
-           debug(`Reached limit of ${limit}, stopping`);
-           this.push(null);
-           await reader.close();
-           return;
-         }
-         if (counter % 10000 === 0) {
-           let m = process.memoryUsage().heapTotal;
-           const end = new Date().getTime();
-           debug(
-             `Read ${counter} ${(counter * 1000) / (end - start)}/sec, Node reported memory usage: ${
-               m / 1024 / 1024
-             } MBs`
-           );
-         }
-         this.push(token);
-       } else {
-         await reader.close();
-         this.push(null);
-       }
+     const reader = await getReader(options);
+     let columns;
+     if (options.columns) {
+         const { fieldList } = await this.schema(options);
+         columns = [];
+         let requestedColumns = options.columns;
+         if (typeof options.columns === 'string')
+             requestedColumns = options.columns.split(',').map((d) => d.trim());
+         else
+             requestedColumns = options.columns.map((d) => (d.name ? d.name.trim() : d.trim()));
+         requestedColumns.forEach((c) => {
+             const matchingCols = fieldList
+                 .filter((f) => f.name === c || cleanColumnName(f.name) === cleanColumnName(c))
+                 .map((f) => f.name);
+             columns = columns.concat(matchingCols);
+         });
      }
-   });
-
-   return { stream };
+     let limit = 0;
+     if (parseInt(options.limit, 10) === options.limit)
+         limit = parseInt(options.limit, 10);
+     // create a new cursor
+     debug(`Reading parquet file ${options.filename} with columns ${columns?.join(',')} and limit ${limit}`);
+     const cursor = reader.getCursor(columns);
+     let counter = 0;
+     const start = new Date().getTime();
+     const stream = new Readable({
+         objectMode: true,
+         async read() {
+             const token = await cursor.next();
+             if (token) {
+                 counter += 1;
+                 if (limit && counter > limit) {
+                     debug(`Reached limit of ${limit}, stopping`);
+                     this.push(null);
+                     await reader.close();
+                     return;
+                 }
+                 if (counter % 10000 === 0) {
+                     let m = process.memoryUsage().heapTotal;
+                     const end = new Date().getTime();
+                     debug(`Read ${counter} ${(counter * 1000) / (end - start)}/sec, Node reported memory usage: ${m / 1024 / 1024} MBs`);
+                 }
+                 this.push(token);
+             }
+             else {
+                 await reader.close();
+                 this.push(null);
+             }
+         }
+     });
+     return { stream };
  };
-
  Worker.prototype.stream.metadata = {
-   options: {
-     path: {}
-   }
+     options: {
+         path: {}
+     }
  };
-
  Worker.prototype.toFile = async function (options) {
-   const { stream } = await this.stream(options);
-   const fworker = new FileWorker(this);
-   return fworker.objectStreamToFile({ ...options, stream });
+     const { stream } = await this.stream(options);
+     const fworker = new FileWorker(this);
+     return fworker.objectStreamToFile({ ...options, stream });
  };
  Worker.prototype.toFile.metadata = {
-   options: {
-     path: {}
-   }
+     options: {
+         path: {}
+     }
  };
-
  Worker.prototype.stats = async function (options) {
-   const reader = await getReader(options);
-   const schema = reader.getSchema();
-   const fileMetadata = reader.getFileMetaData();
-   const rowGroups = reader.getRowGroups();
-
-   // const reader = await parquet.ParquetReader.openS3(client, getParams(options));
-   // return reader.getSchema();
-   return {
-     schema,
-     fileMetadata,
-     rowGroups
-   };
+     const reader = await getReader(options);
+     const schema = reader.getSchema();
+     const fileMetadata = reader.getFileMetaData();
+     const rowGroups = reader.getRowGroups();
+     // const reader = await parquet.ParquetReader.openS3(client, getParams(options));
+     // return reader.getSchema();
+     return {
+         schema,
+         fileMetadata,
+         rowGroups
+     };
  };
  Worker.prototype.stats.metadata = {
-   options: {
-     path: {}
-   }
+     options: {
+         path: {}
+     }
  };
-
- module.exports = Worker;
+ export default Worker;
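
Parquet.js keeps the same behavior; the change is the module format (require/module.exports to import/export default) plus reformatted output. For orientation, a hedged sketch of driving the stream method shown above; the filename, column list, and limit are illustrative values:

    import ParquetWorker from './Parquet.js';

    const worker = new ParquetWorker();
    // stream() resolves to { stream }, where stream is an object-mode Readable of rows.
    // filename may be a local path or an s3:// URI (read via ParquetReader.openS3).
    const { stream } = await worker.stream({
      filename: 's3://example-bucket/events.parquet', // illustrative
      columns: 'id,email', // optional: comma-separated string or array of column names
      limit: 1000, // optional cap on rows pushed before the stream closes
    });
    for await (const row of stream) {
      console.log(row);
    }
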
package/file/R2.js CHANGED
@@ -1,37 +1,32 @@
- const util = require('node:util');
- const {
-   S3Client,
- } = require('@aws-sdk/client-s3');
- const S3 = require('./S3');
-
+ import util from "node:util";
+ import clientS3 from "@aws-sdk/client-s3";
+ import S3 from "./S3.js";
+ const { S3Client, } = clientS3;
  function R2(worker) {
-   S3.call(this, worker);
-   this.prefix='r2';
+     S3.call(this, worker);
+     this.prefix = 'r2';
  }
  util.inherits(R2, S3);
-
  R2.prototype.getClient = function () {
-   const missing = ['CLOUDFLARE_R2_ACCOUNT_ID', 'CLOUDFLARE_R2_ACCESS_KEY_ID', 'CLOUDFLARE_R2_SECRET_ACCESS_KEY']
-     .filter((r) => !process.env[r]);
-   if (missing.length > 0) throw new Error(`Missing environment variables for Cloudflare access:${missing.join(',')}`);
-   const ACCOUNT_ID = process.env.CLOUDFLARE_R2_ACCOUNT_ID;
-   const ACCESS_KEY_ID = process.env.CLOUDFLARE_R2_ACCESS_KEY_ID;
-   const SECRET_ACCESS_KEY = process.env.CLOUDFLARE_R2_SECRET_ACCESS_KEY;
-
-   if (!this.client) {
-     this.client = new S3Client({
-       // R2 does not strictly require a region, but the SDK expects one. 'auto' works fine.
-       region: 'auto',
-       endpoint: `https://${ACCOUNT_ID}.r2.cloudflarestorage.com`,
-       credentials: {
-         accessKeyId: ACCESS_KEY_ID,
-         secretAccessKey: SECRET_ACCESS_KEY,
-       },
-       forcePathStyle: true, // Important for R2 compatibility
-
-     });
-   }
-   return this.client;
+     const missing = ['CLOUDFLARE_R2_ACCOUNT_ID', 'CLOUDFLARE_R2_ACCESS_KEY_ID', 'CLOUDFLARE_R2_SECRET_ACCESS_KEY']
+         .filter((r) => !process.env[r]);
+     if (missing.length > 0)
+         throw new Error(`Missing environment variables for Cloudflare access:${missing.join(',')}`);
+     const ACCOUNT_ID = process.env.CLOUDFLARE_R2_ACCOUNT_ID;
+     const ACCESS_KEY_ID = process.env.CLOUDFLARE_R2_ACCESS_KEY_ID;
+     const SECRET_ACCESS_KEY = process.env.CLOUDFLARE_R2_SECRET_ACCESS_KEY;
+     if (!this.client) {
+         this.client = new S3Client({
+             // R2 does not strictly require a region, but the SDK expects one. 'auto' works fine.
+             region: 'auto',
+             endpoint: `https://${ACCOUNT_ID}.r2.cloudflarestorage.com`,
+             credentials: {
+                 accessKeyId: ACCESS_KEY_ID,
+                 secretAccessKey: SECRET_ACCESS_KEY,
+             },
+             forcePathStyle: true, // Important for R2 compatibility
+         });
+     }
+     return this.client;
  };
-
- module.exports = R2;
+ export default R2;
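
R2.js likewise changes only its module format. As a reminder of what getClient, shown above, expects at runtime, a hedged sketch; the constructor argument and credential values are placeholders:

    import R2 from './R2.js';

    // getClient() throws unless CLOUDFLARE_R2_ACCOUNT_ID, CLOUDFLARE_R2_ACCESS_KEY_ID and
    // CLOUDFLARE_R2_SECRET_ACCESS_KEY are set; it caches a single S3Client pointed at
    // https://<ACCOUNT_ID>.r2.cloudflarestorage.com with region 'auto' and path-style URLs.
    const r2 = new R2(parentWorker); // parentWorker: whatever the parent S3 worker normally receives
    const client = r2.getClient();
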