mdb-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdb/__init__.py +0 -0
- mdb/atomic.py +25 -0
- mdb/data/SKILL.md +95 -0
- mdb/data/__init__.py +0 -0
- mdb/discovery.py +70 -0
- mdb/filelock.py +76 -0
- mdb/formatter.py +101 -0
- mdb/init.py +150 -0
- mdb/mdb.py +1214 -0
- mdb/models.py +81 -0
- mdb/parser.py +609 -0
- mdb/puller.py +212 -0
- mdb/validators.py +46 -0
- mdb_cli-0.1.0.dist-info/METADATA +220 -0
- mdb_cli-0.1.0.dist-info/RECORD +18 -0
- mdb_cli-0.1.0.dist-info/WHEEL +4 -0
- mdb_cli-0.1.0.dist-info/entry_points.txt +2 -0
- mdb_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
mdb/models.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Data model classes for mdb tools."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class FeedMarker:
    """A parsed feed marker found in a markdown file.

    Field order is part of the generated constructor signature, so new
    fields must be appended (with defaults) rather than inserted.
    """
    # 1-based line of the marker; for fenced-codeblock markers the parser
    # stores the line of the opening fence here.
    line_number: int
    # Scope text taken from the marker, stripped; may be an empty string.
    scope: str
    # Query text as written in the marker (stripped; codeblock lines are
    # joined with newlines by the parser).
    raw_query: str
    # First identifier following FROM in raw_query, or None if none was found.
    table_name: str | None
    # Marker emoji identifying this as a feed marker.
    direction: str = "🌀"
    # 1-based line of the closing fence for codeblock markers; None for
    # single-line backtick markers.
    end_line_number: int | None = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class MarkdownTable:
    """A parsed markdown table immediately following a query marker."""
    # Trimmed header cells, in document order.
    columns: list[str]
    # Trimmed data cells; the parser pads/truncates each row to len(columns).
    rows: list[list[str]]
    # 1-based line number of the header row.
    start_line: int
    # 1-based line number of the last data row, or of the separator row
    # when the table has no data rows.
    end_line: int
    # Per-column type names; not populated by the table parser itself.
    # NOTE(review): presumably filled in later from header/cast type
    # resolution -- confirm against the caller.
    column_types: list[str] | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class FeedTask:
    """A marker and its associated table, ready for database execution."""
    # The feed marker that introduced this task.
    marker: FeedMarker
    # The table that follows the marker in the markdown file.
    table: MarkdownTable
    # Path of the database file the task targets.
    # NOTE(review): presumably the value returned by the scope path
    # resolver -- confirm against the code that builds tasks.
    resolved_db_path: str
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class FeedResult:
    """The outcome of executing a single feed task."""
    # The task this result belongs to.
    task: FeedTask
    # Whether the task completed without error.
    success: bool
    # Error description; defaults to None.
    # NOTE(review): presumably only set when success is False -- confirm.
    error: str | None = None
    # Number of rows written; defaults to 0.
    rows_written: int = 0
    # Whether the task was skipped rather than executed; defaults to False.
    # NOTE(review): skip conditions are decided by the executor, not here.
    skipped: bool = False
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class TapMarker:
    """A parsed tap marker found in a markdown file.

    Field order is part of the generated constructor signature, so new
    fields must be appended (with defaults) rather than inserted.
    """
    # 1-based line of the marker; for fenced-codeblock markers the parser
    # stores the line of the opening fence here.
    line_number: int
    # Scope text taken from the marker, stripped; may be an empty string.
    scope: str
    # Query text as written in the marker (stripped; codeblock lines are
    # joined with newlines by the parser).
    raw_query: str
    # Marker emoji identifying this as a tap marker.
    direction: str = "💎"
    # 1-based line of the closing fence for codeblock markers; None for
    # single-line backtick markers.
    end_line_number: int | None = None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class TapResult:
    """The outcome of executing a single tap query."""
    # The tap marker whose query was executed.
    marker: TapMarker
    # Whether the query completed without error.
    success: bool
    # Result column names; a fresh empty list per instance by default.
    columns: list[str] = field(default_factory=list)
    # Result rows as lists of strings; a fresh empty list per instance by default.
    rows: list[list[str]] = field(default_factory=list)
    # Error description; defaults to None.
    # NOTE(review): presumably only set when success is False -- confirm.
    error: str | None = None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class FileMarkers:
    """All parsed markers and content for a single markdown file."""
    # Path of the markdown file these markers came from.
    filepath: str
    # Full text of the file.
    content: str
    # The file split into lines.
    # NOTE(review): presumably content.splitlines() -- confirm at the call site.
    lines: list[str]
    # Tap markers found in the file (expected element type: TapMarker).
    tap_markers: list = field(default_factory=list)
    # Feed markers found in the file (expected element type: FeedMarker).
    feed_markers: list = field(default_factory=list)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class FeedConflict:
    """A detected cross-file feed conflict."""
    # Scope shared by the conflicting feed markers.
    scope: str
    # Database path the conflicting markers resolve to.
    resolved_path: str
    # Table name targeted by the conflicting markers.
    table_name: str
    # Locations of the markers involved in the conflict.
    conflicting: list  # list of (filepath, line_number) tuples
|
mdb/parser.py
ADDED
|
@@ -0,0 +1,609 @@
|
|
|
1
|
+
"""Markdown parsing: markers, tables, table name extraction."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from mdb.models import FeedMarker, MarkdownTable, TapMarker
|
|
9
|
+
|
|
10
|
+
# Directory name (joined onto the current working directory) that holds
# the database files produced by scope resolution.
MDB_DIR = ".mdb"
# Database filename used when neither a scope nor include paths are given.
IMPLICIT_DB_FILENAME = "_.db"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def compute_include_hash(include_paths: str) -> str:
    """Return the MD5 hex digest of the normalized --include value.

    Normalization splits the argument on any whitespace, orders the
    tokens alphabetically and re-joins them with single spaces, so the
    digest does not depend on token order or spacing. Repeated tokens
    are kept (no de-duplication).

    The caller is responsible for passing a value that contains at least
    one non-whitespace character.
    """
    canonical = " ".join(sorted(include_paths.split()))
    digest = hashlib.md5(canonical.encode("utf-8"))
    return digest.hexdigest()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def resolve_scope_path(scope: str, md_filepath: str, include_paths: str | None = None) -> str:
    """Build the database path for a scope / include-paths combination.

    The file lives in the .mdb/ directory under the current working
    directory and is named <include-part>-<scope-part>.db:

    - neither set:   _.db
    - scope only:    _-<md5(scope)>.db
    - include only:  <include-hash>-_.db
    - both set:      <include-hash>-<md5(scope)>.db

    A scope or include_paths value that is empty or whitespace-only
    counts as unset. md_filepath is accepted only for API compatibility
    and plays no part in the result.
    """
    scope_given = bool(scope.strip())
    include_given = bool((include_paths or "").strip())

    if not (scope_given or include_given):
        db_name = IMPLICIT_DB_FILENAME
    else:
        # "_" is the placeholder for whichever segment is unset.
        include_part = compute_include_hash(include_paths) if include_given else "_"
        scope_part = hashlib.md5(scope.encode("utf-8")).hexdigest() if scope_given else "_"
        db_name = f"{include_part}-{scope_part}.db"

    return os.path.normpath(os.path.join(os.getcwd(), MDB_DIR, db_name))
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Regex for FROM <table_name> (case-insensitive).
# Group 1 is a bare \w+ identifier, so a qualified or quoted name is only
# captured up to its first non-word character.
_FROM_RE = re.compile(r'\bFROM\s+(\w+)', re.IGNORECASE)

# Regex for feed marker (🌀): `💾 [scope] 🌀 <query>`
# Includes the table below as part of the dataset (table required -- defines column schema)
# scope group is optional to support implicit scope (e.g. `💾 🌀 SELECT ...`)
# Group 1 = scope (may be empty), group 2 = query (at least one character).
_MARKER_RE = re.compile(
    r'^`💾\s*(.*?)\s*🌀\s*(.+?)`\s*$'
)

# Regex for tap marker (💎): `💾 [scope] 💎 <query>`
# Executes query on dataset and renders results as the table below (table optional -- auto-generated if absent)
# scope group is optional to support implicit scope (e.g. `💾 💎 SELECT ...`)
# Group 1 = scope (may be empty), group 2 = query (at least one character).
_READ_MARKER_RE = re.compile(
    r'^`💾\s*(.*?)\s*💎\s*(.+?)`\s*$'
)

# Regex for bare 💾 marker header (inside codeblock, not backtick-wrapped)
# scope group is optional to support implicit scope (e.g. `💾 💎`)
# Group 1 = scope, group 2 = direction emoji, group 3 = optional SQL that
# follows the direction emoji on the header line (may be empty).
_MARKER_HEADER_RE = re.compile(
    r'^💾\s*(.*?)\s*(💎|🌀)\s*(.*?)\s*$'
)

# Regex for opening fence: 0-3 spaces indent, 3+ backticks or tildes, optional language tag
# Group 1 = indent, group 2 = the fence run, group 3 = the fence run when it
# is made of backticks, group 4 = trailing text, which may not contain a backtick.
_OPENING_FENCE_RE = re.compile(
    r'^( {0,3})((`{3,})|~{3,})([^`]*)?$'
)
|
|
88
|
+
|
|
89
|
+
def extract_table_name(query: str) -> str | None:
    """Return the table name referenced by the first FROM clause in *query*.

    The search is case-insensitive and yields the first identifier that
    follows a FROM keyword. None is returned when the query contains no
    such pattern.
    """
    found = _FROM_RE.search(query)
    return found.group(1) if found else None
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _parse_marker_header(line: str) -> tuple[str, str, str] | None:
    """Split a bare (non-backtick) 💾 header line into its parts.

    Returns a (scope, direction, sql_tail) tuple, where direction is
    exactly '💎' or '🌀' and scope / sql_tail are whitespace-trimmed
    (either may be empty). Returns None when the line is not a header.
    """
    parsed = _MARKER_HEADER_RE.match(line.strip())
    if parsed is None:
        return None
    raw_scope, direction, raw_tail = parsed.groups()
    return (raw_scope.strip(), direction, raw_tail.strip())
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _is_opening_fence(line: str) -> tuple[str, int, int] | None:
    """Recognize the opening fence of a fenced codeblock.

    A fence is 3 or more backticks or tildes, indented by at most three
    spaces, optionally followed by a language tag. Trailing whitespace
    on the line is ignored.

    Returns (fence_char, fence_length, indent) for a fence, else None.
    """
    found = _OPENING_FENCE_RE.match(line.rstrip())
    if found is None:
        return None
    indent, fence = found.group(1, 2)
    return (fence[0], len(fence), len(indent))
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _is_closing_fence(line: str, fence_char: str, fence_length: int) -> bool:
    """Tell whether *line* closes a fence opened with *fence_char*.

    A closing fence is indented by at most three spaces, consists solely
    of the opening fence character, and is at least as long as the
    opening fence. Trailing whitespace is ignored.
    """
    trimmed = line.rstrip()
    run = trimmed.lstrip(' ')
    # More than three leading spaces means this is not a fence.
    if len(trimmed) - len(run) > 3:
        return False
    # A non-empty run made only of the fence character has exactly that
    # one character in its set; an empty run yields an empty set.
    return set(run) == {fence_char} and len(run) >= fence_length
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _find_all_markers(content: str) -> list[TapMarker | FeedMarker]:
    """Scan content for both single-line and codeblock markers in a single pass.

    Returns all markers (read and write) in document order.

    The scan is a line-based state machine with four states:

    - SCANNING: outside any fenced codeblock; looks for an opening fence
      or a single-line backtick marker.
    - CHECK_FIRST_LINE: the line right after an opening fence; decides
      whether the codeblock is an mdb codeblock (first line is a 💾 header).
    - IN_CODEBLOCK: inside an mdb codeblock; collects SQL lines until the
      closing fence, then emits one marker.
    - SKIP_CODEBLOCK: inside any other codeblock; ignores everything,
      including lines that look like markers, until the closing fence.

    Warnings for an empty query or an unclosed mdb codeblock are printed
    to stderr and no marker is emitted for that codeblock.
    """
    lines = content.splitlines()
    markers = []

    # State machine
    state = "SCANNING"
    fence_char = None
    fence_length = 0
    fence_start_line = 0  # 1-based
    header_info = None  # (scope, direction, sql_tail)
    sql_lines = []

    for line_num_0, line in enumerate(lines):
        line_num = line_num_0 + 1  # 1-based

        if state == "SCANNING":
            # Check for opening fence
            fence_info = _is_opening_fence(line)
            if fence_info is not None:
                # Remember the fence so the matching closing fence can be
                # recognized; the indent is not needed.
                fence_char, fence_length, _ = fence_info
                fence_start_line = line_num
                state = "CHECK_FIRST_LINE"
                header_info = None
                sql_lines = []
                continue

            # Check for single-line backtick markers
            stripped = line.strip()
            # The tap pattern is tried first, so a line matching both
            # patterns is treated as a tap marker.
            read_match = _READ_MARKER_RE.match(stripped)
            if read_match:
                scope = read_match.group(1).strip()
                raw_query = read_match.group(2).strip()
                markers.append(TapMarker(
                    line_number=line_num,
                    scope=scope,
                    raw_query=raw_query,
                    direction="💎",
                ))
                continue

            write_match = _MARKER_RE.match(stripped)
            if write_match:
                scope = write_match.group(1).strip()
                raw_query = write_match.group(2).strip()
                table_name = extract_table_name(raw_query)
                markers.append(FeedMarker(
                    line_number=line_num,
                    scope=scope,
                    raw_query=raw_query,
                    table_name=table_name,
                    direction="🌀",
                ))
                continue

        elif state == "CHECK_FIRST_LINE":
            # Check if this line is a closing fence (empty codeblock)
            if _is_closing_fence(line, fence_char, fence_length):
                state = "SCANNING"
                continue

            # Check if first content line is a 💾 marker header
            header_info = _parse_marker_header(line)
            if header_info is not None:
                sql_lines = []
                if header_info[2]:  # sql_tail
                    # SQL written on the header line is the first query line.
                    sql_lines.append(header_info[2])
                state = "IN_CODEBLOCK"
            else:
                # Not an mdb codeblock - skip until closing fence
                state = "SKIP_CODEBLOCK"

        elif state == "IN_CODEBLOCK":
            if _is_closing_fence(line, fence_char, fence_length):
                # Assemble query and emit marker
                assembled = "\n".join(sql_lines).strip()
                if not assembled:
                    print(
                        f"Warning: Empty query in codeblock at line {fence_start_line}, skipping",
                        file=sys.stderr,
                    )
                else:
                    scope, direction, _ = header_info
                    # Codeblock markers are anchored at the opening fence
                    # and record the closing fence as end_line_number.
                    if direction == "💎":
                        markers.append(TapMarker(
                            line_number=fence_start_line,
                            scope=scope,
                            raw_query=assembled,
                            direction="💎",
                            end_line_number=line_num,
                        ))
                    elif direction == "🌀":
                        table_name = extract_table_name(assembled)
                        markers.append(FeedMarker(
                            line_number=fence_start_line,
                            scope=scope,
                            raw_query=assembled,
                            table_name=table_name,
                            direction="🌀",
                            end_line_number=line_num,
                        ))
                state = "SCANNING"
            else:
                sql_lines.append(line)

        elif state == "SKIP_CODEBLOCK":
            if _is_closing_fence(line, fence_char, fence_length):
                state = "SCANNING"

    # Handle unclosed codeblock at EOF
    # Only an mdb codeblock triggers the warning; other unclosed
    # codeblocks end the scan silently.
    if state == "IN_CODEBLOCK":
        print(
            f"Warning: Unclosed codeblock at line {fence_start_line}, skipping",
            file=sys.stderr,
        )

    return markers
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def parse_markdown_table(lines: list[str], start_index: int) -> MarkdownTable | None:
    """Parse the markdown table whose header row is at *start_index*.

    lines[start_index] must be a pipe-delimited header row and the line
    after it a valid separator row; otherwise None is returned. Data
    rows are read until the first line without a pipe, and each row is
    padded with empty strings or truncated to the header's column count.

    start_line / end_line on the result are 1-based; end_line is the last
    data row, or the separator row when there are no data rows.
    """
    total = len(lines)
    if start_index >= total:
        return None

    header = lines[start_index]
    if '|' not in header:
        return None
    columns = _parse_pipe_row(header)
    if not columns:
        return None

    # The separator row must directly follow the header.
    sep_index = start_index + 1
    if sep_index >= total or not _is_separator_row(lines[sep_index]):
        return None

    width = len(columns)
    rows = []
    for idx in range(sep_index + 1, total):
        raw = lines[idx]
        if '|' not in raw:
            break
        cells = _parse_pipe_row(raw)
        # Pad short rows; a negative repeat count adds nothing.
        cells.extend([""] * (width - len(cells)))
        rows.append(cells[:width])

    return MarkdownTable(
        columns=columns,
        rows=rows,
        start_line=start_index + 1,
        # One line per collected row after the (1-based) separator line.
        end_line=sep_index + 1 + len(rows),
    )
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _parse_pipe_row(line: str) -> list[str]:
    """Split a pipe-delimited table row into whitespace-trimmed cells.

    One leading and one trailing pipe (after trimming the line) are
    treated as row borders and dropped before splitting.
    """
    inner = line.strip().removeprefix('|').removesuffix('|')
    cells = []
    for piece in inner.split('|'):
        cells.append(piece.strip())
    return cells
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def _is_separator_row(line: str) -> bool:
    """Tell whether *line* is a markdown table separator row.

    The row must contain a pipe, and every cell must be one or more
    dashes once surrounding whitespace and alignment colons are removed.
    """
    text = line.strip()
    if '|' not in text:
        return False
    dash_runs = (cell.strip().strip(':') for cell in _parse_pipe_row(text))
    # A non-empty run made only of dashes has exactly {'-'} as its set.
    return all(run != "" and set(run) == {'-'} for run in dash_runs)
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def find_tap_markers(content: str) -> list[TapMarker]:
    """Return the tap markers (💎) found in markdown *content*.

    A tap marker runs its query and renders the result as the table
    below it; that table is optional and is generated from the database
    when missing. Both single-line backtick markers and fenced codeblock
    markers are recognized, and the result keeps document order.
    """
    taps = []
    for marker in _find_all_markers(content):
        if isinstance(marker, TapMarker):
            taps.append(marker)
    return taps
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def find_feed_markers(content: str) -> list[FeedMarker]:
    """Return the feed markers (🌀) found in markdown *content*.

    A feed marker contributes the table below it to the dataset; that
    table is required because it defines the column schema. Both
    single-line backtick markers and fenced codeblock markers are
    recognized, and the result keeps document order.
    """
    feeds = []
    for marker in _find_all_markers(content):
        if isinstance(marker, FeedMarker):
            feeds.append(marker)
    return feeds
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def find_adjacent_table(lines, marker_line_number, end_line_number=None):
    """Locate the markdown table that directly follows a marker.

    Starting after the marker (or after the closing fence when
    end_line_number is given), the first non-blank line is taken as the
    candidate header row and parsed as a table.

    Returns (MarkdownTable, line_index) on success, where line_index is
    the 0-based index of the header row; returns (None, None) when no
    non-blank line follows or that line does not start a valid table.
    """
    # A 1-based line number doubles as the 0-based index of the next line.
    first = end_line_number or marker_line_number
    header_index = next(
        (k for k in range(first, len(lines)) if lines[k].strip()),
        None,
    )
    if header_index is None:
        return (None, None)

    table = parse_markdown_table(lines, header_index)
    if table is None:
        return (None, None)
    return (table, header_index)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
# ---------------------------------------------------------------------------
|
|
391
|
+
# Type annotation parsing (consolidated from type_parser.py)
|
|
392
|
+
# ---------------------------------------------------------------------------
|
|
393
|
+
|
|
394
|
+
# Column type names accepted in header annotations and cast expressions.
VALID_TYPES: frozenset[str] = frozenset({"TEXT", "INTEGER", "REAL", "NUMERIC", "BLOB"})


def parse_header_types(header_cells: list[str]) -> tuple[list[str], list[str], list[str], list[str]]:
    """Parse type annotations from markdown table header cells.

    Uses last-colon delimiter to split name from type. This is a thin
    wrapper over parse_header_types_with_annotation_flags that drops the
    per-column "was annotated" flags, so both functions always agree.

    Returns (column_names, column_types, warnings, errors).
    """
    names, types, _flags, warnings, errors = parse_header_types_with_annotation_flags(header_cells)
    return names, types, warnings, errors


def parse_header_types_with_annotation_flags(
    header_cells: list[str],
) -> tuple[list[str], list[str], list[bool], list[str], list[str]]:
    """Parse header cells and report which columns were explicitly annotated.

    Each cell is trimmed and split at its LAST colon into a name and a
    type, so a name may itself contain colons. Per cell:

    - no colon: the whole cell is the name, type TEXT, not annotated;
    - empty type ("id:"): type TEXT, not annotated, plus a warning;
    - valid type (case-insensitive member of VALID_TYPES): the upper-cased
      type, annotated;
    - anything else: type TEXT, not annotated, plus an error.

    Returns (column_names, column_types, was_annotated, warnings, errors);
    the first three lists have one entry per header cell.
    """
    column_names: list[str] = []
    column_types: list[str] = []
    was_annotated: list[bool] = []
    warnings: list[str] = []
    errors: list[str] = []

    for cell in header_cells:
        cell = cell.strip()
        head, colon, tail = cell.rpartition(":")

        if not colon:
            # No colon: whole cell is the name, type defaults to TEXT.
            name, declared, annotated = cell, "TEXT", False
        else:
            name = head.strip()
            type_str = tail.strip()
            normalized = type_str.upper()
            if normalized in VALID_TYPES:
                declared, annotated = normalized, True
            else:
                declared, annotated = "TEXT", False
                if not type_str:
                    # Empty type annotation (e.g., "id:")
                    warnings.append(f"Empty type annotation on column '{name}'")
                else:
                    errors.append(f"Invalid type '{type_str}' on column '{name}'")

        column_names.append(name)
        column_types.append(declared)
        was_annotated.append(annotated)

    return column_names, column_types, was_annotated, warnings, errors
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
# Regex to match cast(... as TYPE) - case insensitive
# Group 1 is the target type name of the cast.
_CAST_RE = re.compile(r"cast\s*\(\s*.+?\s+as\s+(\w+)\s*\)", re.IGNORECASE)


def parse_cast_types(raw_query: str) -> tuple[dict[int, str], list[str]]:
    """Parse cast(col as type) expressions from a query marker's SELECT list.

    Only the text between the leading SELECT and the first FROM is
    inspected. Each top-level, comma-separated expression that STARTS
    with a cast(...) contributes an entry keyed by its 0-based position.

    Returns (cast_types, errors) where cast_types maps 0-based position to
    the upper-cased type name, and errors lists casts whose type is not in
    VALID_TYPES. Both are empty when the query has no SELECT ... FROM
    prefix or selects only "*".
    """
    cast_types: dict[int, str] = {}
    errors: list[str] = []

    # Extract the portion between SELECT and FROM. A single path replaces
    # the former two branches, which did identical work behind a guard
    # whose conditional expression did not group the way it read.
    select_match = re.match(
        r"select\s+(.*?)\s+from\s+", raw_query.strip(), re.IGNORECASE | re.DOTALL
    )
    if select_match is None:
        return cast_types, errors

    select_list = select_match.group(1).strip()
    if select_list == "*":
        # select * names no individual columns, so there is nothing to cast.
        return cast_types, errors

    # Split by commas, respecting parentheses
    for position, expr in enumerate(_split_select_list(select_list)):
        cast_match = _CAST_RE.match(expr.strip())
        if cast_match is None:
            continue
        type_str = cast_match.group(1).strip()
        if type_str.upper() in VALID_TYPES:
            cast_types[position] = type_str.upper()
        else:
            errors.append(f"Invalid type '{type_str}' in cast at position {position}")

    return cast_types, errors
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def _split_select_list(select_list: str) -> list[str]:
    """Split a SELECT column list by commas, respecting nesting and quotes.

    A comma separates two expressions only when it is outside every
    parenthesis pair and outside every quoted literal. Text inside
    single or double quotes is copied verbatim, so commas and
    parentheses in a literal such as 'a,b' or ')' no longer shift the
    positions of the expressions that follow. A doubled quote inside a
    literal ('it''s') closes and immediately reopens it, which keeps the
    scanner in the right state.

    Parts are returned unstripped. A trailing empty part (empty input or
    input ending in a comma) is omitted.
    """
    parts: list[str] = []
    current: list[str] = []
    depth = 0
    quote: str | None = None  # the quote character currently open, if any

    for ch in select_list:
        if quote is not None:
            # Inside a literal: only the matching quote is significant.
            if ch == quote:
                quote = None
        elif ch in ("'", '"'):
            quote = ch
        elif ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        elif ch == "," and depth == 0:
            # Top-level separator: finish the current expression and
            # do not include the comma itself.
            parts.append("".join(current))
            current = []
            continue
        current.append(ch)

    if current:
        parts.append("".join(current))
    return parts
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def resolve_column_types(
|
|
544
|
+
header_types: list[str],
|
|
545
|
+
cast_types: dict[int, str],
|
|
546
|
+
num_columns: int,
|
|
547
|
+
was_annotated: list[bool] | None = None,
|
|
548
|
+
) -> list[str]:
|
|
549
|
+
"""Merge header and cast type sources with header precedence.
|
|
550
|
+
|
|
551
|
+
For each column position:
|
|
552
|
+
- If header was explicitly annotated: use header type
|
|
553
|
+
- Else if position in cast_types: use cast type
|
|
554
|
+
- Else: TEXT
|
|
555
|
+
"""
|
|
556
|
+
result: list[str] = []
|
|
557
|
+
for i in range(num_columns):
|
|
558
|
+
if was_annotated is not None and i < len(was_annotated) and was_annotated[i]:
|
|
559
|
+
# Explicitly annotated in header -- header wins
|
|
560
|
+
result.append(header_types[i] if i < len(header_types) else "TEXT")
|
|
561
|
+
elif i in cast_types:
|
|
562
|
+
result.append(cast_types[i])
|
|
563
|
+
elif i < len(header_types):
|
|
564
|
+
result.append(header_types[i])
|
|
565
|
+
else:
|
|
566
|
+
result.append("TEXT")
|
|
567
|
+
return result
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
# ---------------------------------------------------------------------------
|
|
571
|
+
# Post-pull-tap-query parser (027)
|
|
572
|
+
# ---------------------------------------------------------------------------
|
|
573
|
+
|
|
574
|
+
# Separator between scope name and SQL in a post-pull tap query argument.
SCOPE_DELIMITER = " \U0001f48e "  # " 💎 "


def parse_post_pull_tap_query(arg: str) -> tuple[str, str]:
    """Split a post-pull tap query argument into (scope_name, sql_query).

    The argument is cut at the first ' 💎 ' (the emoji with a space on
    each side); both halves are whitespace-trimmed. Without that
    delimiter the scope name is the empty string and the whole trimmed
    argument is the query.
    """
    scope_part, delimiter, query_part = arg.partition(SCOPE_DELIMITER)
    if not delimiter:
        return ("", arg.strip())
    return (scope_part.strip(), query_part.strip())
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
# ---------------------------------------------------------------------------
|
|
592
|
+
# Pre-push-feed-query parser (028)
|
|
593
|
+
# ---------------------------------------------------------------------------
|
|
594
|
+
|
|
595
|
+
# Separator between scope name and SQL in a pre-push feed query argument.
PUSH_SCOPE_DELIMITER = " \U0001f300 "  # " 🌀 "


def parse_pre_push_feed_query(arg: str) -> tuple[str, str]:
    """Split a pre-push feed query argument into (scope_name, sql_query).

    The argument is cut at the first ' 🌀 ' (the emoji with a space on
    each side); both halves are whitespace-trimmed. Without that
    delimiter the scope name is the empty string and the whole trimmed
    argument is the query.
    """
    scope_part, delimiter, query_part = arg.partition(PUSH_SCOPE_DELIMITER)
    if not delimiter:
        return ("", arg.strip())
    return (scope_part.strip(), query_part.strip())
|