s3-querier 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +203 -0
- package/docs/s3-querier.md +196 -0
- package/package.json +71 -0
- package/src/duck-db/index.js +57 -0
- package/src/plugins/avro/avro-plugin.js +64 -0
- package/src/plugins/query-finalizer/query-finalizer.js +41 -0
- package/src/plugins/query-parser/query-parser.js +33 -0
- package/src/s3/auth/ibm-iam-client.js +21 -0
- package/src/s3/auth/ibm-iam-token-manager.js +40 -0
- package/src/s3/s3.js +427 -0
- package/src/s3-querier.js +107 -0
- package/src/utils/bigint-replacer.js +13 -0
- package/src/utils/date-regex/date-regex.js +52 -0
- package/src/utils/file-path-builder/file-path-builder.js +55 -0
- package/src/utils/file-settings/file-settings.js +65 -0
- package/src/utils/logger.js +3 -0
- package/src/utils/path-parser/path-parser-grammar.js +39 -0
- package/src/utils/path-parser/path-parser.js +58 -0
- package/src/utils/sql-parser/sql-parser.js +31 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
// Regex source for an `{endpoint:<value>}` token; the value is captured in the named group `endpoint`.
// Kept as a string so a fresh RegExp (with its own flags) can be built from it where needed.
const endPointRegexStr = '\\{endpoint:(?<endpoint>[a-z0-9-:/.]+)\\}';
|
|
2
|
+
// Regex source for a `{bucket:<value>}` token; the value is captured in the named group `bucket`.
// Unlike the endpoint pattern, the allowed value characters also include `_`.
const bucketRegexStr = '\\{bucket:(?<bucket>[a-z0-9-:/._]+)\\}';
|
|
3
|
+
|
|
4
|
+
/**
 * Groups file settings by their (endpoint, bucket) pair.
 *
 * Within each group, files whose path contains `{` or `*` are listed under
 * `filePatterns`; every other file is listed under `staticFiles`. Groups are
 * returned in order of first appearance in `settings`.
 *
 * @param {DownloadSetting[]} settings File settings (file/cache/endpoint/bucket) parsed from the query
 * @returns {object[]} One `{ filePatterns, staticFiles, endpoint, bucket }` entry per distinct endpoint/bucket pair
 */
export function mergeSettings(settings = []) {
  const tokenRegex = /\{|\*/;
  const groups = new Map();

  for (const { file, cache, endpoint, bucket } of settings) {
    // Serialize the pair instead of joining with "/": endpoint and bucket values may
    // themselves contain "/", so a joined key could merge two different pairs.
    const key = JSON.stringify([endpoint, bucket]);

    let group = groups.get(key);
    if (!group) {
      group = { filePatterns: [], staticFiles: [], endpoint, bucket };
      groups.set(key, group);
    }

    const target = tokenRegex.test(file) ? group.filePatterns : group.staticFiles;
    target.push({ file, cache });
  }

  return [...groups.values()];
}
|
|
32
|
+
|
|
33
|
+
/**
 * Strips every `{endpoint:...}` and `{bucket:...}` token from a query.
 *
 * Endpoint tokens are removed first, then bucket tokens; both are matched
 * globally and case-insensitively. Text around the tokens is left untouched.
 *
 * @param {string} query The query
 * @returns {string} The query with endpoint and bucket tokens removed
 */
export function removeFileSettingTokens(query = '') {
  const tokenPatterns = [endPointRegexStr, bucketRegexStr].map((source) => new RegExp(source, 'gi'));
  return tokenPatterns.reduce((text, pattern) => text.replace(pattern, ''), query);
}
|
|
46
|
+
|
|
47
|
+
/**
 * Collapses runs of two or more `/` into a single `/`. Covers the edge case
 * where a bucket path starts with `/`, which leaves a doubled slash in the
 * file path.
 *
 * Slashes that directly follow a `:` are left alone, so URL schemes elsewhere
 * in the query (`https://`, `s3://`, `file:///`) are not corrupted.
 *
 * @param {string} query The query
 * @returns {string} The query with repeated forward slashes collapsed
 */
export function removeDoubleFwdSlash(query = '') {
  // The lookbehind rejects a run that starts right after ":" (scheme separator) or
  // after another "/" (which would be the tail of a run that was already skipped).
  return query.replace(/(?<![:/])\/{2,}/g, '/');
}
|
|
56
|
+
|
|
57
|
+
/**
 * Strips every `?cache=true` / `?cache=false` suffix from the query.
 * Matching is global and case-insensitive; any other `?cache=` value is kept.
 *
 * @param {string} query The query
 * @returns {string} The query with cache settings removed
 */
export function removeCacheSettings(query = '') {
  const cacheParamPattern = /\?cache=(true|false)/gi;
  const withoutCacheParams = query.replace(cacheParamPattern, '');
  return withoutCacheParams;
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
 * Peggy grammar source for a single file path pattern.
 *
 * A pattern is one or more tokens, optionally followed by a trailing cache
 * parameter. Parsing yields an array of token objects:
 *   - `{ type: "endpoint", value }` for `{endpoint:<value>}` (an optional "/" right after it is consumed)
 *   - `{ type: "bucket", value }`   for `{bucket:<value>}`   (an optional "/" right after it is consumed)
 *   - `{ type: "date", unit }`      for `{yyyy}`, `{MM}`, `{dd}`, `{hh}`, `{mm}`, `{ss}`
 *   - `{ type: "glob" }`            for a `*` wildcard
 *   - `{ type: "literal", value }`  for any run of characters other than `{`, `*` and `?`
 *   - `{ type: "cache", value }`    for a trailing `?cache=true|false` (value is a boolean)
 *
 * The cache parameter is matched case-insensitively, the same way
 * `?cache=(true|false)` is matched when it is stripped from the query.
 */
export const GRAMMAR = `
FilePattern
  = tokens:Token+ cache:CacheParam? {
      if (cache) return [...tokens, cache];
      return tokens;
    }

Token
  = LocationToken
  / DateToken
  / GlobToken
  / LiteralText

LocationToken
  = "{" _ "endpoint:" _ value:LocationValue _ "}" "/"? { return { type: "endpoint", value }; }
  / "{" _ "bucket:" _ value:LocationValue _ "}" "/"? { return { type: "bucket", value }; }

LocationValue
  = chars:[^} \t]+ { return chars.join(""); }

_ = [ \t]*

DateToken
  = "{yyyy}" { return { type: "date", unit: "year" }; }
  / "{MM}" { return { type: "date", unit: "month" }; }
  / "{dd}" { return { type: "date", unit: "day" }; }
  / "{hh}" { return { type: "date", unit: "hour" }; }
  / "{mm}" { return { type: "date", unit: "minute" }; }
  / "{ss}" { return { type: "date", unit: "second" }; }

GlobToken
  = "*" { return { type: "glob" }; }

CacheParam
  = "?cache="i v:("true"i / "false"i) { return { type: "cache", value: v.toLowerCase() === "true" }; }

LiteralText
  = chars:[^{*?]+ { return { type: "literal", value: chars.join("") }; }
`;
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import peggy from 'peggy';
|
|
2
|
+
import { GRAMMAR } from './path-parser-grammar.js';
|
|
3
|
+
|
|
4
|
+
// Lookup from a date token's `unit` to the placeholder text it is written back as
// when a file path is rebuilt from parsed tokens.
const DATE_UNIT_TOKENS = Object.fromEntries([
  ['year', '{yyyy}'],
  ['month', '{MM}'],
  ['day', '{dd}'],
  ['hour', '{hh}'],
  ['minute', '{mm}'],
  ['second', '{ss}'],
]);
|
|
12
|
+
|
|
13
|
+
// Generate the parser from the grammar once, at module load, so parseFilePath can reuse it on every call.
const compiledParser = peggy.generate(GRAMMAR);
|
|
14
|
+
|
|
15
|
+
/**
 * Tokenizes an S3 file path string (the text between the SQL quotes) and
 * derives the endpoint, bucket, file path and cache setting from the tokens.
 *
 * Any error thrown by the underlying parser for input that does not match the
 * grammar is propagated to the caller.
 *
 * @param {string} raw File path string as it appears inside the SQL quotes
 * @returns {{ endpoint: string|null, bucket: string|null, file: string, cache: boolean }}
 */
export function parseFilePath(raw) {
  const parsedTokens = compiledParser.parse(raw);

  const endpoint = extractEndpoint(parsedTokens);
  const bucket = extractBucket(parsedTokens);
  const file = buildFilePath(parsedTokens);
  const cache = extractCache(parsedTokens);

  return { endpoint, bucket, file, cache };
}
|
|
31
|
+
|
|
32
|
+
/** Rebuilds the file path text from the tokens, leaving out endpoint, bucket and cache tokens */
function buildFilePath(tokens) {
  const nonPathTypes = new Set(['endpoint', 'bucket', 'cache']);
  const pathTokens = tokens.filter(({ type }) => !nonPathTypes.has(type));
  const pieces = pathTokens.map((token) => tokenToString(token));
  return pieces.join('');
}
|
|
39
|
+
|
|
40
|
+
/** Renders one parsed token back to path text; token types with no path text yield '' */
function tokenToString(token) {
  switch (token.type) {
    case 'literal':
      return token.value;
    case 'date':
      return DATE_UNIT_TOKENS[token.unit];
    case 'glob':
      return '*';
    default:
      return '';
  }
}
|
|
46
|
+
|
|
47
|
+
/** Returns the value of the first endpoint token, or null when there is none (or it has no value) */
function extractEndpoint(tokens) {
  for (const token of tokens) {
    if (token.type === 'endpoint') {
      return token.value ?? null;
    }
  }
  return null;
}
|
|
50
|
+
|
|
51
|
+
/** Returns the value of the first bucket token, or null when there is none (or it has no value) */
function extractBucket(tokens) {
  for (const token of tokens) {
    if (token.type === 'bucket') {
      return token.value ?? null;
    }
  }
  return null;
}
|
|
54
|
+
|
|
55
|
+
/** Returns the value of the first cache token; caching defaults to true when no cache token is present */
function extractCache(tokens) {
  for (const token of tokens) {
    if (token.type === 'cache') {
      return token.value;
    }
  }
  return true;
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { createRequire } from 'module';
|
|
2
|
+
|
|
3
|
+
const require = createRequire(import.meta.url);
|
|
4
|
+
const Parser = require('tree-sitter');
|
|
5
|
+
const SQL = require('@derekstride/tree-sitter-sql');
|
|
6
|
+
|
|
7
|
+
// Tree-sitter query source: matches an `invocation` whose function identifier
// starts with "read_" and captures, as @file, a literal argument that starts
// with a single quote.
const FILE_QUERY_SOURCE = `
  (invocation
    (object_reference (identifier) @func (#match? @func "^read_"))
    (term (literal) @file (#match? @file "^'"))
  )
`;

// Compiled once at module load and reused for every extractFileReferences call.
const FILE_QUERY = new Parser.Query(SQL, FILE_QUERY_SOURCE);

// Single shared parser instance, configured for the SQL grammar.
const parser = new Parser();
parser.setLanguage(SQL);
|
|
19
|
+
|
|
20
|
+
/**
 * Finds the quoted file arguments captured by FILE_QUERY in a SQL query.
 *
 * Each capture named `file` contributes its text with the first and last
 * character dropped (the surrounding quotes). Duplicates are removed and the
 * order of first appearance is kept.
 *
 * @param {string} query The SQL query
 * @returns {{ raw: string }[]} Unique file references, unquoted
 */
export function extractFileReferences(query) {
  const { rootNode } = parser.parse(query);
  const uniquePaths = new Set();

  for (const { name, node } of FILE_QUERY.captures(rootNode)) {
    if (name !== 'file') {
      continue;
    }
    uniquePaths.add(node.text.slice(1, -1));
  }

  return Array.from(uniquePaths, (raw) => ({ raw }));
}
|