@engine9-io/input-tools 1.8.1 → 1.8.3

This diff compares the contents of two publicly available package versions, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ const { Readable, Transform, PassThrough, Writable } = require('node:stream');
7
7
  const { pipeline } = require('node:stream/promises');
8
8
  const { stringify } = require('csv');
9
9
 
10
- const debug = require('debug')('FileWorker');
10
+ const debug = require('debug')('@engine9-io/file');
11
11
 
12
12
  const { getXlsxStream } = require('xlstream');
13
13
  const csv = require('csv');
package/file/Parquet.js CHANGED
@@ -1,7 +1,7 @@
1
1
  const parquet = require('@dsnp/parquetjs');
2
2
 
3
3
  const { Readable } = require('node:stream');
4
- const debug = require('debug')('ParquetWorker');
4
+ const debug = require('debug')('Parquet');
5
5
  const { S3Client } = require('@aws-sdk/client-s3');
6
6
  const FileWorker = require('./FileUtilities');
7
7
 
@@ -16,7 +16,7 @@ async function getReader(options) {
16
16
 
17
17
  return parquet.ParquetReader.openS3(client, {
18
18
  Bucket: parts[2],
19
- Key: parts.slice(3).join('/'),
19
+ Key: parts.slice(3).join('/')
20
20
  });
21
21
  }
22
22
  return parquet.ParquetReader.openFile(filename);
@@ -25,14 +25,14 @@ async function getReader(options) {
25
25
  Worker.prototype.meta = async function (options) {
26
26
  const reader = await getReader(options);
27
27
  return {
28
- records: String(reader.metadata?.num_rows),
28
+ records: String(reader.metadata?.num_rows)
29
29
  };
30
30
  // getMetadata();
31
31
  };
32
32
  Worker.prototype.meta.metadata = {
33
33
  options: {
34
- path: {},
35
- },
34
+ path: {}
35
+ }
36
36
  };
37
37
  Worker.prototype.schema = async function (options) {
38
38
  const reader = await getReader(options);
@@ -40,8 +40,8 @@ Worker.prototype.schema = async function (options) {
40
40
  };
41
41
  Worker.prototype.schema.metadata = {
42
42
  options: {
43
- path: {},
44
- },
43
+ path: {}
44
+ }
45
45
  };
46
46
 
47
47
  function cleanColumnName(name) {
@@ -49,8 +49,6 @@ function cleanColumnName(name) {
49
49
  }
50
50
 
51
51
  Worker.prototype.stream = async function (options) {
52
- const stream = new Readable({ objectMode: true });
53
-
54
52
  const reader = await getReader(options);
55
53
  let columns;
56
54
  if (options.columns) {
@@ -60,9 +58,9 @@ Worker.prototype.stream = async function (options) {
60
58
  if (typeof options.columns === 'string') requestedColumns = options.columns.split(',').map((d) => d.trim());
61
59
  else requestedColumns = options.columns.map((d) => (d.name ? d.name.trim() : d.trim()));
62
60
  requestedColumns.forEach((c) => {
63
- const matchingCols = fieldList.filter((f) => (
64
- f.name === c || cleanColumnName(f.name) === cleanColumnName(c)
65
- )).map((f) => f.name);
61
+ const matchingCols = fieldList
62
+ .filter((f) => f.name === c || cleanColumnName(f.name) === cleanColumnName(c))
63
+ .map((f) => f.name);
66
64
  columns = columns.concat(matchingCols);
67
65
  });
68
66
  }
@@ -72,35 +70,46 @@ Worker.prototype.stream = async function (options) {
72
70
  debug(`Reading parquet file ${options.filename} with columns ${columns?.join(',')} and limit ${limit}`);
73
71
  const cursor = reader.getCursor(columns);
74
72
 
75
- // read all records from the file and print them
76
- let record = null;
77
73
  let counter = 0;
78
74
 
79
75
  const start = new Date().getTime();
80
- do {
81
- // eslint-disable-next-line no-await-in-loop
82
- record = await cursor.next();
83
- counter += 1;
84
- if (limit && counter > limit) {
85
- debug(`Reached limit of ${limit}, stopping`);
86
- break;
87
- }
88
- if (counter % 5000 === 0) {
89
- const end = new Date().getTime();
90
- debug(`Read ${counter} ${(counter * 1000) / (end - start)}/sec `);
76
+
77
+ const stream = new Readable({
78
+ objectMode: true,
79
+ async read() {
80
+ const token = await cursor.next();
81
+ if (token) {
82
+ counter += 1;
83
+ if (limit && counter > limit) {
84
+ debug(`Reached limit of ${limit}, stopping`);
85
+ this.push(null);
86
+ await reader.close();
87
+ return;
88
+ }
89
+ if (counter % 10000 === 0) {
90
+ let m = process.memoryUsage().heapTotal;
91
+ const end = new Date().getTime();
92
+ debug(
93
+ `Read ${counter} ${(counter * 1000) / (end - start)}/sec, Node reported memory usage: ${
94
+ m / 1024 / 1024
95
+ } MBs`
96
+ );
97
+ }
98
+ this.push(token);
99
+ } else {
100
+ await reader.close();
101
+ this.push(null);
102
+ }
91
103
  }
92
- stream.push(record);
93
- } while (record);
94
- stream.push(null);
95
- await reader.close();
104
+ });
96
105
 
97
106
  return { stream };
98
107
  };
99
108
 
100
109
  Worker.prototype.stream.metadata = {
101
110
  options: {
102
- path: {},
103
- },
111
+ path: {}
112
+ }
104
113
  };
105
114
 
106
115
  Worker.prototype.toFile = async function (options) {
@@ -110,8 +119,8 @@ Worker.prototype.toFile = async function (options) {
110
119
  };
111
120
  Worker.prototype.toFile.metadata = {
112
121
  options: {
113
- path: {},
114
- },
122
+ path: {}
123
+ }
115
124
  };
116
125
 
117
126
  Worker.prototype.stats = async function (options) {
@@ -125,13 +134,13 @@ Worker.prototype.stats = async function (options) {
125
134
  return {
126
135
  schema,
127
136
  fileMetadata,
128
- rowGroups,
137
+ rowGroups
129
138
  };
130
139
  };
131
140
  Worker.prototype.stats.metadata = {
132
141
  options: {
133
- path: {},
134
- },
142
+ path: {}
143
+ }
135
144
  };
136
145
 
137
146
  module.exports = Worker;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@engine9-io/input-tools",
3
- "version": "1.8.1",
3
+ "version": "1.8.3",
4
4
  "description": "Tools for dealing with Engine9 inputs",
5
5
  "main": "index.js",
6
6
  "scripts": {