markback-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
markback/parser.py ADDED
@@ -0,0 +1,587 @@
1
+ """MarkBack parser implementation."""
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Optional
6
+ from urllib.parse import urlparse
7
+
8
+ from .types import (
9
+ Diagnostic,
10
+ ErrorCode,
11
+ ParseResult,
12
+ Record,
13
+ Severity,
14
+ SourceRef,
15
+ WarningCode,
16
+ )
17
+
18
+
19
# Header keywords the parser understands; anything else draws a W002 warning.
KNOWN_HEADERS = {"uri", "source"}

# Grammar tokens and compiled patterns.
HEADER_PATTERN = re.compile(r"^@([a-z]+)\s+(.+)$")
FEEDBACK_DELIMITER = "<<<"
RECORD_SEPARATOR = "---"
COMPACT_PATTERN = re.compile(r"^@source\s+(.+?)\s+<<<\s+(.*)$")


class LineType:
    """String constants naming each MarkBack line classification."""
    COMPACT_RECORD = "compact_record"
    HEADER = "header"
    FEEDBACK = "feedback"
    SEPARATOR = "separator"
    BLANK = "blank"
    CONTENT = "content"


def classify_line(line: str) -> str:
    """Classify a line according to MarkBack grammar.

    Checks run in precedence order: blank, record separator, compact
    record (``@source`` plus a ``<<<`` anywhere on the line), header,
    feedback delimiter, and finally plain content.
    """
    text = line.rstrip()

    if not text:
        return LineType.BLANK

    if text == RECORD_SEPARATOR:
        return LineType.SEPARATOR

    # Compact form must be tested before the generic header case, since
    # both start with '@'.
    if text.startswith("@source") and FEEDBACK_DELIMITER in text:
        return LineType.COMPACT_RECORD

    if text.startswith("@"):
        return LineType.HEADER

    if text.startswith(FEEDBACK_DELIMITER):
        return LineType.FEEDBACK

    return LineType.CONTENT
65
+
66
+
67
def parse_header(line: str) -> tuple[Optional[str], Optional[str], Optional[str]]:
    """Parse a ``@keyword value`` header line.

    Returns a ``(keyword, value, error_message)`` triple: on success the
    first two are populated and the error is None; on a syntax failure
    both are None and the error message describes the bad line.
    """
    text = line.rstrip()
    m = HEADER_PATTERN.match(text)
    if m is None:
        return None, None, f"Malformed header syntax: {text}"
    keyword, value = m.groups()
    return keyword, value, None
74
+
75
+
76
def validate_uri(uri: str) -> Optional[str]:
    """Validate a URI string.

    Returns None when the URI parses and carries a scheme; otherwise an
    error message describing the problem.
    """
    try:
        parsed = urlparse(uri)
    except Exception as e:  # urlparse can raise ValueError (e.g. bad IPv6)
        return f"Invalid URI: {uri} ({e})"
    if parsed.scheme:
        return None
    return f"URI missing scheme: {uri}"
86
+
87
+
88
def parse_compact_record(line: str) -> tuple[Optional[SourceRef], Optional[str], Optional[str]]:
    """Parse a compact record line (``@source <path> <<< <feedback>``).

    Returns ``(source, feedback, error_message)``: on success the source
    ref and feedback text are populated and the error is None; otherwise
    both are None and the error explains the syntax failure.
    """
    m = COMPACT_PATTERN.match(line.rstrip())
    if m is None:
        return None, None, f"Invalid compact record syntax: {line}"
    path_part, feedback_part = m.group(1), m.group(2)
    return SourceRef(path_part), feedback_part, None
98
+
99
+
100
def parse_string(
    text: str,
    source_file: Optional[Path] = None,
) -> ParseResult:
    """Parse a MarkBack string into records.

    Handles single-record, multi-record, and compact formats.

    Runs a line-by-line state machine: headers accumulate into
    ``current_headers``, content lines into ``current_content_lines``,
    and a ``<<<`` feedback line (or a compact ``@source ... <<<`` line)
    finalizes the pending record. After the scan, file-level checks emit
    diagnostics for duplicate and missing URIs.

    Args:
        text: Full MarkBack document text.
        source_file: Optional originating file path, recorded on
            diagnostics and records for error reporting.

    Returns:
        A ParseResult holding the parsed records and all diagnostics.
    """
    lines = text.split('\n')
    # Remove trailing empty line if present (from final newline)
    if lines and lines[-1] == '':
        lines = lines[:-1]

    records: list[Record] = []
    diagnostics: list[Diagnostic] = []

    def add_diagnostic(
        severity: Severity,
        code: ErrorCode | WarningCode,
        message: str,
        line_num: Optional[int] = None,
        col: Optional[int] = None,
        record_idx: Optional[int] = None,
    ) -> None:
        # Helper: append a Diagnostic tagged with this parse's source file.
        diagnostics.append(Diagnostic(
            file=source_file,
            line=line_num,
            column=col,
            severity=severity,
            code=code,
            message=message,
            record_index=record_idx,
        ))

    # State for parsing
    current_headers: dict[str, str] = {}
    current_content_lines: list[str] = []
    current_start_line: int = 1
    pending_uri: Optional[str] = None  # For compact records with preceding @uri
    in_content: bool = False           # True once a content line has been seen
    had_blank_line: bool = False       # Blank seen after headers, before content

    def finalize_record(feedback: str, end_line: int, is_compact: bool = False) -> None:
        """Create a record from current state and reset it for the next record."""
        nonlocal current_headers, current_content_lines, current_start_line
        nonlocal pending_uri, in_content, had_blank_line

        # @uri header wins over a pending_uri captured earlier.
        uri = current_headers.get("uri") or pending_uri
        source_str = current_headers.get("source")
        source = SourceRef(source_str) if source_str else None

        content = None
        if current_content_lines:
            content = '\n'.join(current_content_lines)
            # Trim leading/trailing blank lines from content
            content_lines = content.split('\n')
            while content_lines and not content_lines[0].strip():
                content_lines.pop(0)
            while content_lines and not content_lines[-1].strip():
                content_lines.pop()
            # All-blank content collapses to None.
            content = '\n'.join(content_lines) if content_lines else None

        record = Record(
            feedback=feedback,
            uri=uri,
            source=source,
            content=content,
            _source_file=source_file,
            _start_line=current_start_line,
            _end_line=end_line,
            _is_compact=is_compact,
        )
        records.append(record)

        # Reset state
        current_headers = {}
        current_content_lines = []
        current_start_line = end_line + 1
        pending_uri = None
        in_content = False
        had_blank_line = False

    line_num = 0
    while line_num < len(lines):
        line = lines[line_num]
        line_num += 1  # 1-indexed for diagnostics
        line_type = classify_line(line)

        # Check for trailing whitespace.
        # NOTE(review): lines come from split('\n'), so rstrip('\n') is a
        # no-op here and the inner condition repeats the outer one —
        # looks redundant; confirm whether one check was meant for '\r'.
        if line.rstrip() != line.rstrip('\n'):
            if line != line.rstrip():
                add_diagnostic(
                    Severity.WARNING,
                    WarningCode.W004,
                    "Trailing whitespace",
                    line_num,
                )

        if line_type == LineType.SEPARATOR:
            # Record separator - finalize any pending record
            if current_headers or current_content_lines:
                # Missing feedback
                add_diagnostic(
                    Severity.ERROR,
                    ErrorCode.E001,
                    "Missing feedback (no <<< delimiter found)",
                    current_start_line,
                    record_idx=len(records),
                )
            # NOTE(review): current_headers / current_content_lines are
            # not cleared here, so an unterminated record's headers and
            # content may leak into the next record — confirm intended.
            current_start_line = line_num + 1
            pending_uri = None
            in_content = False
            had_blank_line = False
            continue

        if line_type == LineType.BLANK:
            # A blank after headers marks the header/content boundary;
            # blanks inside content are preserved verbatim.
            if current_headers and not in_content:
                had_blank_line = True
            elif in_content:
                current_content_lines.append("")
            continue

        if line_type == LineType.COMPACT_RECORD:
            # Compact record: @source ... <<<
            source, feedback, error = parse_compact_record(line)
            if error:
                add_diagnostic(
                    Severity.ERROR,
                    ErrorCode.E006,
                    error,
                    line_num,
                )
                continue

            # Empty-but-present feedback is an error, yet the record is
            # still emitted below with feedback "".
            if feedback is not None and not feedback:
                add_diagnostic(
                    Severity.ERROR,
                    ErrorCode.E009,
                    "Empty feedback (nothing after <<< )",
                    line_num,
                )

            # Use any pending @uri from previous line
            uri = pending_uri or current_headers.get("uri")

            record = Record(
                feedback=feedback or "",
                uri=uri,
                source=source,
                content=None,
                _source_file=source_file,
                _start_line=current_start_line,
                _end_line=line_num,
                _is_compact=True,
            )
            records.append(record)

            # Reset state
            current_headers = {}
            current_content_lines = []
            current_start_line = line_num + 1
            pending_uri = None
            in_content = False
            had_blank_line = False
            continue

        if line_type == LineType.HEADER:
            # If we've seen a blank line, treat @-starting lines as content
            # (content that starts with @ requires the blank line separator)
            if had_blank_line or in_content:
                in_content = True
                current_content_lines.append(line)
                continue

            keyword, value, error = parse_header(line)
            if error:
                add_diagnostic(
                    Severity.ERROR,
                    ErrorCode.E006,
                    error,
                    line_num,
                )
                continue

            if keyword not in KNOWN_HEADERS:
                add_diagnostic(
                    Severity.WARNING,
                    WarningCode.W002,
                    f"Unknown header keyword: @{keyword}",
                    line_num,
                )

            if keyword == "uri":
                # An invalid URI is diagnosed but still recorded below.
                uri_error = validate_uri(value)
                if uri_error:
                    add_diagnostic(
                        Severity.ERROR,
                        ErrorCode.E003,
                        uri_error,
                        line_num,
                    )
                # Check if next non-blank line is compact record
                # Store as pending_uri for potential compact record
                pending_uri = value

            current_headers[keyword] = value
            continue

        if line_type == LineType.FEEDBACK:
            # Extract feedback content
            stripped = line.rstrip()
            if stripped == FEEDBACK_DELIMITER:
                # Bare "<<<": error, but still finalize with empty feedback.
                add_diagnostic(
                    Severity.ERROR,
                    ErrorCode.E009,
                    "Empty feedback (nothing after <<< )",
                    line_num,
                )
                feedback = ""
            elif stripped.startswith(FEEDBACK_DELIMITER + " "):
                feedback = stripped[len(FEEDBACK_DELIMITER) + 1:]
            else:
                # <<< with content but no space - try to parse anyway
                feedback = stripped[len(FEEDBACK_DELIMITER):].lstrip()

            # Check for content when @source is present
            if current_headers.get("source") and current_content_lines:
                content_text = '\n'.join(current_content_lines).strip()
                if content_text:
                    add_diagnostic(
                        Severity.ERROR,
                        ErrorCode.E005,
                        "Content present when @source specified",
                        current_start_line,
                        record_idx=len(records),
                    )

            # Check for missing blank line before content that starts with @
            if current_content_lines and not had_blank_line:
                first_content = current_content_lines[0] if current_content_lines else ""
                if first_content.startswith("@"):
                    add_diagnostic(
                        Severity.ERROR,
                        ErrorCode.E010,
                        "Missing blank line before inline content (content starts with @)",
                        current_start_line,
                        record_idx=len(records),
                    )

            finalize_record(feedback, line_num)
            continue

        if line_type == LineType.CONTENT:
            in_content = True
            current_content_lines.append(line)
            continue

    # Check for unterminated record at end of file
    if current_headers or current_content_lines:
        add_diagnostic(
            Severity.ERROR,
            ErrorCode.E001,
            "Missing feedback (no <<< delimiter found)",
            current_start_line,
            record_idx=len(records),
        )

    # Check for duplicate URIs
    seen_uris: dict[str, int] = {}
    for idx, record in enumerate(records):
        if record.uri:
            if record.uri in seen_uris:
                add_diagnostic(
                    Severity.WARNING,
                    WarningCode.W001,
                    f"Duplicate URI: {record.uri} (first seen in record {seen_uris[record.uri]})",
                    record._start_line,
                    record_idx=idx,
                )
            else:
                seen_uris[record.uri] = idx

    # Check for missing URIs
    for idx, record in enumerate(records):
        if not record.uri:
            add_diagnostic(
                Severity.WARNING,
                WarningCode.W006,
                "Missing @uri (record has no identifier)",
                record._start_line,
                record_idx=idx,
            )

    return ParseResult(
        records=records,
        diagnostics=diagnostics,
        source_file=source_file,
    )
398
+
399
+
400
def parse_file(path: Path) -> ParseResult:
    """Parse a MarkBack file from disk.

    A file that is not valid UTF-8 yields no records and a single E006
    error diagnostic; otherwise the text is handed to parse_string.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        bad_encoding = Diagnostic(
            file=path,
            line=None,
            column=None,
            severity=Severity.ERROR,
            code=ErrorCode.E006,
            message="File is not valid UTF-8",
        )
        return ParseResult(
            records=[],
            diagnostics=[bad_encoding],
            source_file=path,
        )

    return parse_string(text, source_file=path)
421
+
422
+
423
def discover_paired_files(
    directory: Path,
    content_patterns: Optional[list[str]] = None,
    label_suffixes: Optional[list[str]] = None,
) -> list[tuple[Path, Optional[Path]]]:
    """Discover content files and their paired label files.

    Args:
        directory: Directory to scan (non-recursive). A missing or
            non-directory path yields an empty result.
        content_patterns: Currently unused beyond its default; kept for
            interface compatibility.
        label_suffixes: Filename suffixes identifying label files.
            Defaults to [".label.txt", ".feedback.txt", ".mb"].

    Returns:
        A list of (content_file, label_file) tuples, sorted by content
        file path so the result is deterministic. label_file is None
        when no matching label file exists.
    """
    if label_suffixes is None:
        label_suffixes = [".label.txt", ".feedback.txt", ".mb"]

    if content_patterns is None:
        content_patterns = ["*"]

    pairs: list[tuple[Path, Optional[Path]]] = []

    # Find all entries in the directory (files and subdirectories).
    all_files = set(directory.iterdir()) if directory.is_dir() else set()

    # An entry is a label file if its name ends with any label suffix.
    label_files = {
        f for f in all_files
        if any(f.name.endswith(suffix) for suffix in label_suffixes)
    }

    # Content files are regular, non-hidden files that are not labels.
    content_files = [
        f for f in all_files
        if f.is_file()
        and f not in label_files
        and not f.name.startswith(".")
    ]

    # Fix: iterate in sorted order — iterating the set directly made the
    # returned pair order nondeterministic between runs.
    for content_file in sorted(content_files):
        label_file = None
        basename = content_file.stem  # filename without final extension

        for suffix in label_suffixes:
            # Try "<stem><suffix>" first (e.g. doc.txt -> doc.label.txt)...
            candidate = directory / (basename + suffix)
            if candidate.exists():
                label_file = candidate
                break

            # ...then "<full name><suffix>" (covers extensionless files
            # where stem == name would already have matched, and dotted
            # names like archive.tar.gz).
            candidate = directory / (content_file.name + suffix)
            if candidate.exists():
                label_file = candidate
                break

        pairs.append((content_file, label_file))

    return pairs
480
+
481
+
482
def parse_paired_files(
    content_file: Path,
    label_file: Path,
) -> ParseResult:
    """Parse a paired content + label file combination.

    The label file supplies the feedback record; the content file
    supplies the record's content (when it is valid UTF-8 text).
    """
    # Parse the label file and carry over its diagnostics.
    label_result = parse_file(label_file)
    diagnostics: list[Diagnostic] = list(label_result.diagnostics)

    if not label_result.records:
        return ParseResult(
            records=[],
            diagnostics=diagnostics,
            source_file=label_file,
        )

    # Paired mode expects a single record whose content comes from the
    # paired file; only the first record is used.
    record = label_result.records[0]

    # Point the record at the content file unless @source was explicit.
    if record.source is None:
        record.source = SourceRef(str(content_file))

    # Inline content inside a paired label file is suspicious — warn.
    if record.content:
        diagnostics.append(Diagnostic(
            file=label_file,
            line=record._start_line,
            column=None,
            severity=Severity.WARNING,
            code=WarningCode.W008,
            message="Paired label file should not contain inline content",
        ))

    # Replace any inline content with the paired file's text; a binary
    # (non-UTF-8) file leaves content as None, reachable via the source.
    try:
        record.content = content_file.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        record.content = None

    record._source_file = label_file

    return ParseResult(
        records=[record],
        diagnostics=diagnostics,
        source_file=label_file,
    )
533
+
534
+
535
def parse_directory(
    directory: Path,
    label_suffixes: Optional[list[str]] = None,
    recursive: bool = False,
) -> ParseResult:
    """Parse all MarkBack files in a directory.

    Handles both standalone .mb files and paired file mode.

    Args:
        directory: Directory to scan.
        label_suffixes: Suffixes identifying label files; defaults to
            [".label.txt", ".feedback.txt", ".mb"].
        recursive: When True, the .mb glob descends into subdirectories.
            NOTE(review): paired-file discovery below is always
            non-recursive — confirm whether this asymmetry is intended.

    Returns:
        A ParseResult aggregating records and diagnostics from every
        file parsed, with source_file set to the directory.
    """
    if label_suffixes is None:
        label_suffixes = [".label.txt", ".feedback.txt", ".mb"]

    all_records: list[Record] = []
    all_diagnostics: list[Diagnostic] = []

    # Find all .mb files (standalone MarkBack files)
    mb_files = list(directory.glob("**/*.mb" if recursive else "*.mb"))

    for mb_file in mb_files:
        # A .mb file is standalone unless it matches one of the *other*
        # label suffixes; ".mb" itself is exempted so plain .mb files
        # are parsed directly.
        is_label_file = False
        for suffix in label_suffixes:
            if mb_file.name.endswith(suffix) and suffix != ".mb":
                is_label_file = True
                break

        if not is_label_file:
            result = parse_file(mb_file)
            all_records.extend(result.records)
            all_diagnostics.extend(result.diagnostics)

    # Find paired files.
    # NOTE(review): discover_paired_files defaults include ".mb" as a
    # label suffix, so a .mb file already parsed standalone above could
    # be parsed again here as the label of a paired content file,
    # duplicating records — confirm whether that overlap can occur.
    pairs = discover_paired_files(directory, label_suffixes=label_suffixes)
    for content_file, label_file in pairs:
        if label_file:
            result = parse_paired_files(content_file, label_file)
            all_records.extend(result.records)
            all_diagnostics.extend(result.diagnostics)
        else:
            # Content file with no matching feedback file: warn only.
            all_diagnostics.append(Diagnostic(
                file=content_file,
                line=None,
                column=None,
                severity=Severity.WARNING,
                code=WarningCode.W007,
                message=f"Paired feedback file not found for {content_file.name}",
            ))

    return ParseResult(
        records=all_records,
        diagnostics=all_diagnostics,
        source_file=directory,
    )