s3-querier 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,65 @@
1
// Regex sources for the file setting tokens embedded in a query. They are kept
// as pattern strings; removeFileSettingTokens compiles them with the `gi` flags.

// Matches `{endpoint:<value>}`. The named group `endpoint` captures a value made
// of one or more of: a-z, 0-9, `-`, `:`, `/`, `.`.
const endPointRegexStr = '\\{endpoint:(?<endpoint>[a-z0-9-:/.]+)\\}';

// Matches `{bucket:<value>}`. The named group `bucket` captures a value made of
// the same characters as an endpoint value, plus `_`.
const bucketRegexStr = '\\{bucket:(?<bucket>[a-z0-9-:/._]+)\\}';
3
+
4
/**
 * Merges endpoints and bucket file values.
 *
 * Settings that share the same endpoint AND the same bucket are collected into
 * one group. Within a group, each file goes to `filePatterns` when it contains
 * a `{` token or a `*` wildcard, and to `staticFiles` otherwise. Groups are
 * returned in order of first appearance.
 *
 * @param {DownloadSetting[]} settings File settings (endpoint/bucket) parsed from the query
 * @returns {object[]} Settings merged on endpoint/buckets; each entry has the shape
 *   `{ filePatterns, staticFiles, endpoint, bucket }`
 */
export function mergeSettings(settings = []) {
  // A file is a pattern when it holds a `{...}` token or a `*` glob.
  const tokenRegex = /\{|\*/;

  // Two-level lookup (endpoint -> bucket -> group). A joined string key such as
  // `${endpoint}/${bucket}` is ambiguous because both parts may contain `/`:
  // ('host/a', 'b') and ('host', 'a/b') would collapse into a single group.
  const groupsByEndpoint = new Map();
  const merged = [];

  for (const { file, cache, endpoint, bucket } of settings) {
    let groupsByBucket = groupsByEndpoint.get(endpoint);
    if (!groupsByBucket) {
      groupsByBucket = new Map();
      groupsByEndpoint.set(endpoint, groupsByBucket);
    }

    let group = groupsByBucket.get(bucket);
    if (!group) {
      group = { filePatterns: [], staticFiles: [], endpoint, bucket };
      groupsByBucket.set(bucket, group);
      merged.push(group);
    }

    const target = tokenRegex.test(file) ? group.filePatterns : group.staticFiles;
    target.push({ file, cache });
  }

  return merged;
}
32
+
33
/**
 * Strips every `{endpoint:...}` and `{bucket:...}` token from the query.
 *
 * Both token patterns are applied globally and case-insensitively; endpoint
 * tokens are removed first, then bucket tokens.
 *
 * @param {string} query The query
 * @returns {string} The query with file setting tokens removed
 */
export function removeFileSettingTokens(query = '') {
  const endpointTokens = new RegExp(endPointRegexStr, 'gi');
  const bucketTokens = new RegExp(bucketRegexStr, 'gi');
  return query.replace(endpointTokens, '').replace(bucketTokens, '');
}
46
+
47
/**
 * Collapses every run of two or more consecutive `/` characters into a single `/`.
 * Covers the edge case where bucket paths start with `/`.
 *
 * The replacement applies to the whole string, so a `//` anywhere in the query
 * (not only inside file paths) is collapsed as well.
 *
 * @param {string} query The query
 * @returns {string} The query with repeated forward slashes collapsed
 */
export function removeDoubleFwdSlash(query = '') {
  const repeatedSlashes = /\/\/+/g;
  return query.replace(repeatedSlashes, '/');
}
56
+
57
/**
 * Removes every `?cache=true` / `?cache=false` suffix from the query.
 * Matching is case-insensitive and applies to all occurrences.
 *
 * @param {string} query The query
 * @returns {string} The query with the cache settings removed
 */
export function removeCacheSettings(query = '') {
  const cacheSetting = /\?cache=(true|false)/gi;
  return query.replace(cacheSetting, '');
}
@@ -0,0 +1,3 @@
1
+ import pino from 'pino';
2
// Shared logger instance for the package, created by calling pino() with no options.
export const logger = pino();
@@ -0,0 +1,39 @@
1
/**
 * Grammar source for S3 file path strings; the path parser compiles it with
 * `peggy.generate`.
 *
 * A FilePattern is one or more tokens optionally followed by a cache parameter,
 * and parses to an array of token objects:
 *   - `{ type: "endpoint" | "bucket", value }` for `{endpoint:...}` / `{bucket:...}`
 *     (spaces/tabs around the value are allowed; one trailing `/` is consumed)
 *   - `{ type: "date", unit }` for `{yyyy}`, `{MM}`, `{dd}`, `{hh}`, `{mm}`, `{ss}`
 *   - `{ type: "glob" }` for `*`
 *   - `{ type: "literal", value }` for any run of characters other than `{`, `*`, `?`
 *   - `{ type: "cache", value: boolean }` for a trailing `?cache=true|false`
 *
 * Note: `\t` is evaluated by this template literal, so the grammar text holds
 * literal tab characters inside the two character classes that use it.
 */
export const GRAMMAR = `
FilePattern
= tokens:Token+ cache:CacheParam? {
if (cache) return [...tokens, cache];
return tokens;
}

Token
= LocationToken
/ DateToken
/ GlobToken
/ LiteralText

LocationToken
= "{" _ "endpoint:" _ value:LocationValue _ "}" "/"? { return { type: "endpoint", value }; }
/ "{" _ "bucket:" _ value:LocationValue _ "}" "/"? { return { type: "bucket", value }; }

LocationValue
= chars:[^} \t]+ { return chars.join(""); }

_ = [ \t]*

DateToken
= "{yyyy}" { return { type: "date", unit: "year" }; }
/ "{MM}" { return { type: "date", unit: "month" }; }
/ "{dd}" { return { type: "date", unit: "day" }; }
/ "{hh}" { return { type: "date", unit: "hour" }; }
/ "{mm}" { return { type: "date", unit: "minute" }; }
/ "{ss}" { return { type: "date", unit: "second" }; }

GlobToken
= "*" { return { type: "glob" }; }

CacheParam
= "?cache=" v:("true" / "false") { return { type: "cache", value: v === "true" }; }

LiteralText
= chars:[^{*?]+ { return { type: "literal", value: chars.join("") }; }
`;
@@ -0,0 +1,58 @@
1
+ import peggy from 'peggy';
2
+ import { GRAMMAR } from './path-parser-grammar.js';
3
+
4
// Maps the `unit` of a parsed date token back to the placeholder text it was
// parsed from, so the file path can be rebuilt with its date placeholders intact.
// Month is upper-case `{MM}` and minute is lower-case `{mm}`.
const DATE_UNIT_TOKENS = {
  year: '{yyyy}',
  month: '{MM}',
  day: '{dd}',
  hour: '{hh}',
  minute: '{mm}',
  second: '{ss}',
};
12
+
13
// Parser generated from GRAMMAR once at module load and reused by every call.
const compiledParser = peggy.generate(GRAMMAR);

/**
 * Tokenizes an S3 file path string (without its surrounding SQL quotes) and
 * derives the endpoint, bucket, file path, and cache setting from the tokens.
 *
 * Nothing is caught here: any error raised by the generated parser for input
 * it cannot parse propagates to the caller.
 *
 * @param {string} raw File path string as it appears inside the SQL quotes
 * @returns {{ endpoint: string|null, bucket: string|null, file: string, cache: boolean }}
 */
export function parseFilePath(raw) {
  const tokens = compiledParser.parse(raw);

  const endpoint = extractEndpoint(tokens);
  const bucket = extractBucket(tokens);
  const file = buildFilePath(tokens);
  const cache = extractCache(tokens);

  return { endpoint, bucket, file, cache };
}
31
+
32
/**
 * Rebuilds the file path text from parsed tokens.
 *
 * Endpoint, bucket and cache tokens are skipped; every other token is rendered
 * with tokenToString and the pieces are concatenated in order.
 *
 * @param {object[]} tokens Tokens produced by the path parser
 * @returns {string} The file path without endpoint/bucket/cache information
 */
function buildFilePath(tokens) {
  const isPathToken = (token) =>
    !(token.type === 'endpoint' || token.type === 'bucket' || token.type === 'cache');

  const parts = tokens.flatMap((token) => (isPathToken(token) ? [tokenToString(token)] : []));
  return parts.join('');
}
39
+
40
/**
 * Renders a single parsed token back to its path text.
 *
 * @param {object} token A token with a `type` field
 * @returns {string} The literal's value, the date placeholder looked up in
 *   DATE_UNIT_TOKENS by `token.unit`, `*` for a glob, or `''` for any other type
 */
function tokenToString(token) {
  switch (token.type) {
    case 'literal':
      return token.value;
    case 'date':
      return DATE_UNIT_TOKENS[token.unit];
    case 'glob':
      return '*';
    default:
      return '';
  }
}
46
+
47
/**
 * Finds the endpoint among the parsed tokens.
 *
 * @param {object[]} tokens Tokens produced by the path parser
 * @returns {string|null} The value of the first `endpoint` token, or null when
 *   there is no such token or its value is null/undefined
 */
function extractEndpoint(tokens) {
  const endpointToken = tokens.find((token) => token.type === 'endpoint');
  if (endpointToken === undefined) return null;
  return endpointToken.value ?? null;
}
50
+
51
/**
 * Finds the bucket among the parsed tokens.
 *
 * @param {object[]} tokens Tokens produced by the path parser
 * @returns {string|null} The value of the first `bucket` token, or null when
 *   there is no such token or its value is null/undefined
 */
function extractBucket(tokens) {
  const bucketToken = tokens.find((token) => token.type === 'bucket');
  if (bucketToken === undefined) return null;
  return bucketToken.value ?? null;
}
54
+
55
/**
 * Reads the cache setting from the parsed tokens.
 *
 * @param {object[]} tokens Tokens produced by the path parser
 * @returns {boolean} The value of the first `cache` token; true when the path
 *   carries no cache token
 */
function extractCache(tokens) {
  const found = tokens.find((token) => token.type === 'cache');
  if (!found) return true;
  return found.value;
}
@@ -0,0 +1,31 @@
1
+ import { createRequire } from 'module';
2
+
3
+ const require = createRequire(import.meta.url);
4
+ const Parser = require('tree-sitter');
5
+ const SQL = require('@derekstride/tree-sitter-sql');
6
+
7
// Tree-sitter query source: within an invocation, captures the function
// identifier as @func when it matches /^read_/ and a literal term as @file when
// its text starts with a single quote.
const FILE_QUERY_SOURCE = `
(invocation
(object_reference (identifier) @func (#match? @func "^read_"))
(term (literal) @file (#match? @file "^'"))
)
`;

// Query compiled once against the SQL language and reused for every extraction.
const FILE_QUERY = new Parser.Query(SQL, FILE_QUERY_SOURCE);

// Single parser instance configured for the SQL language.
const parser = new Parser();
parser.setLanguage(SQL);
19
+
20
/**
 * Collects the distinct file strings captured by FILE_QUERY in a SQL query.
 *
 * Only `file` captures are used. Each capture's text has its first and last
 * character dropped (the captured literal starts with a single quote), and
 * repeated strings are reported once, in order of first appearance.
 *
 * @param {string} query The SQL query text
 * @returns {{ raw: string }[]} One entry per distinct file string
 */
export function extractFileReferences(query) {
  const { rootNode } = parser.parse(query);
  const uniquePaths = new Set();

  for (const capture of FILE_QUERY.captures(rootNode)) {
    if (capture.name !== 'file') continue;
    // Set insertion keeps the first occurrence and ignores later duplicates.
    uniquePaths.add(capture.node.text.slice(1, -1));
  }

  return Array.from(uniquePaths, (raw) => ({ raw }));
}