elspais 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
elspais/core/parser.py ADDED
@@ -0,0 +1,596 @@
1
+ """
2
+ elspais.core.parser - Requirement file parsing.
3
+
4
+ Parses Markdown files containing requirements in the standard format.
5
+ """
6
+
7
+ import re
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Sequence, Union
10
+
11
+ from elspais.core.models import Assertion, ParseResult, ParseWarning, Requirement
12
+ from elspais.core.patterns import PatternConfig, PatternValidator
13
+
14
+
15
class RequirementParser:
    """
    Parses requirement specifications from Markdown files.

    Requirements are located by header lines of the form ``PREFIX-id: Title``
    and terminated by an ``*End*`` marker line.  Whether a candidate ID is
    actually a requirement ID is decided by a PatternValidator built from the
    supplied PatternConfig, not by these regexes alone.
    """

    # Regex patterns for parsing.
    # Generic pattern to find potential requirement headers.  The leading
    # "#" characters are optional, so both "## REQ-1: Title" and a bare
    # "REQ-1: Title" line match.  Actual ID validation is done by
    # PatternValidator.
    HEADER_PATTERN = re.compile(
        r"^#*\s*(?P<id>[A-Z]+-[A-Za-z0-9-]+):\s*(?P<title>.+)$"
    )
    # Metadata line: "**Level**: X | **Implements**: ... | **Status**: Y".
    # The Implements and Status segments are optional (their groups are None
    # when absent).
    LEVEL_STATUS_PATTERN = re.compile(
        r"\*\*Level\*\*:\s*(?P<level>\w+)"
        r"(?:\s*\|\s*\*\*Implements\*\*:\s*(?P<implements>[^|\n]+))?"
        r"(?:\s*\|\s*\*\*Status\*\*:\s*(?P<status>\w+))?"
    )
    # Fallback: a standalone "**Status**:" field not attached to the Level line.
    ALT_STATUS_PATTERN = re.compile(
        r"\*\*Status\*\*:\s*(?P<status>\w+)"
    )
    # Fallback: a standalone "**Implements**:" field not attached to the Level line.
    IMPLEMENTS_PATTERN = re.compile(
        r"\*\*Implements\*\*:\s*(?P<implements>[^|\n]+)"
    )
    # End-of-requirement marker, e.g. "*End* *REQ-1*", optionally followed by
    # "| **Hash**: <hex>" carrying a content hash.
    END_MARKER_PATTERN = re.compile(
        r"^\*End\*\s+\*[^*]+\*\s*(?:\|\s*\*\*Hash\*\*:\s*(?P<hash>[a-zA-Z0-9]+))?",
        re.MULTILINE
    )
    # "**Rationale**: ..." captured up to a blank line, the next bold field,
    # or end of text (DOTALL lets the rationale span multiple lines).
    RATIONALE_PATTERN = re.compile(
        r"\*\*Rationale\*\*:\s*(.+?)(?=\n\n|\n\*\*|\Z)", re.DOTALL
    )
    # "**Acceptance Criteria**:" followed by one or more "- " bullet lines.
    ACCEPTANCE_PATTERN = re.compile(
        r"\*\*Acceptance Criteria\*\*:\s*\n((?:\s*-\s*.+\n?)+)", re.MULTILINE
    )
    # Assertions section header.  Matches only a "## Assertions" line on its
    # own (a bold "**Assertions**" variant is NOT matched by this regex).
    ASSERTIONS_HEADER_PATTERN = re.compile(
        r"^##\s+Assertions\s*$", re.MULTILINE
    )
    # Individual assertion line: "A. The system SHALL..." or "01. ..." etc.
    # Captures: label (uppercase letters/digits), text (remainder of that
    # line only; continuation lines are not captured by this pattern).
    ASSERTION_LINE_PATTERN = re.compile(
        r"^\s*([A-Z0-9]+)\.\s+(.+)$", re.MULTILINE
    )

    # Default values that mean "no references" in the Implements field.
    DEFAULT_NO_REFERENCE_VALUES = ["-", "null", "none", "x", "X", "N/A", "n/a"]

    # Default placeholder values that indicate a removed/deprecated assertion
    # (matched as a case-insensitive prefix of the assertion text).
    DEFAULT_PLACEHOLDER_VALUES = [
        "obsolete", "removed", "deprecated", "N/A", "n/a", "-", "reserved"
    ]
64
+
65
+ def __init__(
66
+ self,
67
+ pattern_config: PatternConfig,
68
+ no_reference_values: Optional[List[str]] = None,
69
+ placeholder_values: Optional[List[str]] = None,
70
+ ):
71
+ """
72
+ Initialize parser with pattern configuration.
73
+
74
+ Args:
75
+ pattern_config: Configuration for ID patterns
76
+ no_reference_values: Values in Implements field that mean "no references"
77
+ placeholder_values: Values that indicate removed/deprecated assertions
78
+ """
79
+ self.pattern_config = pattern_config
80
+ self.validator = PatternValidator(pattern_config)
81
+ self.no_reference_values = (
82
+ no_reference_values
83
+ if no_reference_values is not None
84
+ else self.DEFAULT_NO_REFERENCE_VALUES
85
+ )
86
+ self.placeholder_values = (
87
+ placeholder_values
88
+ if placeholder_values is not None
89
+ else self.DEFAULT_PLACEHOLDER_VALUES
90
+ )
91
+
92
    def parse_text(
        self,
        text: str,
        file_path: Optional[Path] = None,
        subdir: str = "",
    ) -> ParseResult:
        """
        Parse requirements from text.

        A requirement starts at a header line whose ID passes the configured
        validator and ends at an "*End*" marker line (plus an optional "---"
        separator) or at the next valid requirement header.

        Args:
            text: Markdown text containing requirements
            file_path: Optional source file path for location tracking
            subdir: Subdirectory within spec/ (e.g., "roadmap", "archive", "")

        Returns:
            ParseResult with requirements dict and warnings list
        """
        requirements: Dict[str, Requirement] = {}
        warnings: List[ParseWarning] = []
        lines = text.split("\n")

        i = 0
        while i < len(lines):
            line = lines[i]

            # Look for requirement header
            header_match = self.HEADER_PATTERN.match(line)
            if header_match:
                req_id = header_match.group("id")

                # Validate ID against configured pattern; header-like lines
                # with non-matching IDs are treated as ordinary text.
                if not self.validator.is_valid(req_id):
                    i += 1
                    continue

                title = header_match.group("title").strip()
                start_line = i + 1  # 1-indexed

                # Find the end of this requirement
                req_lines = [line]
                i += 1
                while i < len(lines):
                    req_lines.append(lines[i])
                    # Check for end marker or next requirement
                    if self.END_MARKER_PATTERN.match(lines[i]):
                        i += 1
                        # Skip separator line if present
                        if i < len(lines) and lines[i].strip() == "---":
                            i += 1
                        break
                    # Check for next valid requirement header
                    next_match = self.HEADER_PATTERN.match(lines[i])
                    if next_match and self.validator.is_valid(next_match.group("id")):
                        # Hit next requirement without end marker.  (That
                        # header line was already appended to req_lines above;
                        # _extract_body skips header lines, so it does not
                        # leak into the requirement body.)
                        break
                    i += 1

                # Parse the requirement block
                req_text = "\n".join(req_lines)
                req, block_warnings = self._parse_requirement_block(
                    req_id, title, req_text, file_path, start_line, subdir
                )
                warnings.extend(block_warnings)
                if req:
                    # Check for duplicate ID - the first occurrence wins.
                    if req_id in requirements:
                        warnings.append(ParseWarning(
                            requirement_id=req_id,
                            message=f"Duplicate ID ignored (first occurrence at line {requirements[req_id].line_number})",
                            file_path=file_path,
                            line_number=start_line,
                        ))
                    else:
                        requirements[req_id] = req
            else:
                # Not a requirement header: advance to the next line.
                i += 1

        return ParseResult(requirements=requirements, warnings=warnings)
170
+
171
+ def parse_file(
172
+ self,
173
+ file_path: Path,
174
+ subdir: str = "",
175
+ ) -> ParseResult:
176
+ """
177
+ Parse requirements from a file.
178
+
179
+ Args:
180
+ file_path: Path to the Markdown file
181
+ subdir: Subdirectory within spec/ (e.g., "roadmap", "archive", "")
182
+
183
+ Returns:
184
+ ParseResult with requirements dict and warnings list
185
+ """
186
+ text = file_path.read_text(encoding="utf-8")
187
+ return self.parse_text(text, file_path, subdir)
188
+
189
+ def parse_directory(
190
+ self,
191
+ directory: Path,
192
+ patterns: Optional[List[str]] = None,
193
+ skip_files: Optional[List[str]] = None,
194
+ subdir: str = "",
195
+ ) -> ParseResult:
196
+ """
197
+ Parse all requirements from a directory.
198
+
199
+ Args:
200
+ directory: Path to the spec directory
201
+ patterns: Optional glob patterns to match files
202
+ skip_files: Optional list of filenames to skip
203
+ subdir: Subdirectory within spec/ (e.g., "roadmap", "archive", "")
204
+
205
+ Returns:
206
+ ParseResult with requirements dict and warnings list
207
+ """
208
+ if patterns is None:
209
+ patterns = ["*.md"]
210
+
211
+ if skip_files is None:
212
+ skip_files = []
213
+
214
+ requirements: Dict[str, Requirement] = {}
215
+ warnings: List[ParseWarning] = []
216
+
217
+ for pattern in patterns:
218
+ for file_path in directory.glob(pattern):
219
+ if file_path.is_file() and file_path.name not in skip_files:
220
+ result = self.parse_file(file_path, subdir)
221
+ # Merge requirements, checking for cross-file duplicates
222
+ for req_id, req in result.requirements.items():
223
+ if req_id in requirements:
224
+ warnings.append(ParseWarning(
225
+ requirement_id=req_id,
226
+ message=f"Duplicate ID ignored (first occurrence in {requirements[req_id].file_path})",
227
+ file_path=file_path,
228
+ line_number=req.line_number,
229
+ ))
230
+ else:
231
+ requirements[req_id] = req
232
+ warnings.extend(result.warnings)
233
+
234
+ return ParseResult(requirements=requirements, warnings=warnings)
235
+
236
+ def parse_directories(
237
+ self,
238
+ directories: Union[str, Path, Sequence[Union[str, Path]]],
239
+ base_path: Optional[Path] = None,
240
+ patterns: Optional[List[str]] = None,
241
+ skip_files: Optional[List[str]] = None,
242
+ ) -> ParseResult:
243
+ """
244
+ Parse all requirements from one or more directories.
245
+
246
+ Does NOT recursively search subdirectories - only the specified directories.
247
+
248
+ Args:
249
+ directories: Single directory path (str/Path) or list of directory paths
250
+ base_path: Base path to resolve relative directories against
251
+ patterns: Optional glob patterns to match files (default: ["*.md"])
252
+ skip_files: Optional list of filenames to skip
253
+
254
+ Returns:
255
+ ParseResult with requirements dict and warnings list
256
+ """
257
+ # Normalize to list
258
+ if isinstance(directories, (str, Path)):
259
+ dir_list = [directories]
260
+ else:
261
+ dir_list = list(directories)
262
+
263
+ if base_path is None:
264
+ base_path = Path.cwd()
265
+
266
+ requirements: Dict[str, Requirement] = {}
267
+ warnings: List[ParseWarning] = []
268
+
269
+ for dir_entry in dir_list:
270
+ if Path(dir_entry).is_absolute():
271
+ dir_path = Path(dir_entry)
272
+ else:
273
+ dir_path = base_path / dir_entry
274
+ if dir_path.exists() and dir_path.is_dir():
275
+ result = self.parse_directory(
276
+ dir_path, patterns=patterns, skip_files=skip_files
277
+ )
278
+ # Merge requirements, checking for cross-directory duplicates
279
+ for req_id, req in result.requirements.items():
280
+ if req_id in requirements:
281
+ warnings.append(ParseWarning(
282
+ requirement_id=req_id,
283
+ message=f"Duplicate ID ignored (first occurrence in {requirements[req_id].file_path})",
284
+ file_path=req.file_path,
285
+ line_number=req.line_number,
286
+ ))
287
+ else:
288
+ requirements[req_id] = req
289
+ warnings.extend(result.warnings)
290
+
291
+ return ParseResult(requirements=requirements, warnings=warnings)
292
+
293
+ def parse_directory_with_subdirs(
294
+ self,
295
+ directory: Path,
296
+ subdirs: Optional[List[str]] = None,
297
+ patterns: Optional[List[str]] = None,
298
+ skip_files: Optional[List[str]] = None,
299
+ ) -> ParseResult:
300
+ """
301
+ Parse requirements from a directory and its subdirectories.
302
+
303
+ Unlike parse_directory, this method:
304
+ - Parses the root directory (with subdir="")
305
+ - Parses each specified subdirectory (with subdir set to the subdir name)
306
+
307
+ Args:
308
+ directory: Path to the spec directory
309
+ subdirs: List of subdirectory names to include (e.g., ["roadmap", "archive"])
310
+ patterns: Optional glob patterns to match files
311
+ skip_files: Optional list of filenames to skip
312
+
313
+ Returns:
314
+ ParseResult with requirements dict and warnings list
315
+ """
316
+ if subdirs is None:
317
+ subdirs = []
318
+
319
+ requirements: Dict[str, Requirement] = {}
320
+ warnings: List[ParseWarning] = []
321
+
322
+ # Parse root directory
323
+ root_result = self.parse_directory(
324
+ directory, patterns=patterns, skip_files=skip_files, subdir=""
325
+ )
326
+ requirements.update(root_result.requirements)
327
+ warnings.extend(root_result.warnings)
328
+
329
+ # Parse each subdirectory
330
+ for subdir_name in subdirs:
331
+ subdir_path = directory / subdir_name
332
+ if subdir_path.exists() and subdir_path.is_dir():
333
+ subdir_result = self.parse_directory(
334
+ subdir_path, patterns=patterns, skip_files=skip_files, subdir=subdir_name
335
+ )
336
+ # Merge requirements, checking for cross-subdir duplicates
337
+ for req_id, req in subdir_result.requirements.items():
338
+ if req_id in requirements:
339
+ warnings.append(ParseWarning(
340
+ requirement_id=req_id,
341
+ message=f"Duplicate ID ignored (first occurrence in {requirements[req_id].file_path})",
342
+ file_path=req.file_path,
343
+ line_number=req.line_number,
344
+ ))
345
+ else:
346
+ requirements[req_id] = req
347
+ warnings.extend(subdir_result.warnings)
348
+
349
+ return ParseResult(requirements=requirements, warnings=warnings)
350
+
351
    def _parse_requirement_block(
        self,
        req_id: str,
        title: str,
        text: str,
        file_path: Optional[Path],
        line_number: int,
        subdir: str = "",
    ) -> "tuple[Optional[Requirement], List[ParseWarning]]":
        """
        Parse a single requirement block.

        Args:
            req_id: The requirement ID
            title: The requirement title
            text: The full requirement text block
            file_path: Source file path
            line_number: Starting line number
            subdir: Subdirectory within spec/ (e.g., "roadmap", "archive", "")

        Returns:
            Tuple of (Requirement or None, List[ParseWarning]).  Note: as
            written this always builds a Requirement; callers nevertheless
            guard against None.
        """
        block_warnings: List[ParseWarning] = []

        # Extract level, status, and implements from the metadata line;
        # all three default to "unknown"/empty when absent.
        level = "Unknown"
        status = "Unknown"
        implements_str = ""

        level_match = self.LEVEL_STATUS_PATTERN.search(text)
        if level_match:
            # Optional groups come back as None; normalise to the defaults.
            level = level_match.group("level") or "Unknown"
            implements_str = level_match.group("implements") or ""
            status = level_match.group("status") or "Unknown"

        # Try alternative status pattern (a standalone "**Status**:" field)
        if status == "Unknown":
            alt_status_match = self.ALT_STATUS_PATTERN.search(text)
            if alt_status_match:
                status = alt_status_match.group("status")

        # Try alternative implements pattern (standalone "**Implements**:")
        if not implements_str:
            impl_match = self.IMPLEMENTS_PATTERN.search(text)
            if impl_match:
                implements_str = impl_match.group("implements")

        # Parse implements list and warn about references that do not match
        # the configured ID pattern (they are still kept in the list).
        implements = self._parse_implements(implements_str)
        for ref in implements:
            if not self.validator.is_valid(ref):
                block_warnings.append(ParseWarning(
                    requirement_id=req_id,
                    message=f"Invalid implements reference: {ref}",
                    file_path=file_path,
                    line_number=line_number,
                ))

        # Extract body (text between header and acceptance/end)
        body = self._extract_body(text)

        # Extract rationale
        rationale = None
        rationale_match = self.RATIONALE_PATTERN.search(text)
        if rationale_match:
            rationale = rationale_match.group(1).strip()

        # Extract acceptance criteria (legacy format)
        acceptance_criteria = []
        acceptance_match = self.ACCEPTANCE_PATTERN.search(text)
        if acceptance_match:
            criteria_text = acceptance_match.group(1)
            # Keep only "- " bullet lines, stripped of the bullet marker.
            # NOTE(review): lstrip("- ") removes ALL leading '-' and ' '
            # characters, so "- -x" becomes "x" — confirm this is intended.
            acceptance_criteria = [
                line.strip().lstrip("- ").strip()
                for line in criteria_text.split("\n")
                if line.strip().startswith("-")
            ]

        # Extract assertions (new format) and warn about malformed labels
        # (the assertions themselves are still kept).
        assertions = self._extract_assertions(text)
        for assertion in assertions:
            if not self._is_valid_assertion_label(assertion.label):
                block_warnings.append(ParseWarning(
                    requirement_id=req_id,
                    message=f"Invalid assertion label format: {assertion.label}",
                    file_path=file_path,
                    line_number=line_number,
                ))

        # Extract the optional content hash from the end marker.
        hash_value = None
        end_match = self.END_MARKER_PATTERN.search(text)
        if end_match:
            hash_value = end_match.group("hash")

        req = Requirement(
            id=req_id,
            title=title,
            level=level,
            status=status,
            body=body,
            implements=implements,
            acceptance_criteria=acceptance_criteria,
            assertions=assertions,
            rationale=rationale,
            hash=hash_value,
            file_path=file_path,
            line_number=line_number,
            subdir=subdir,
        )
        return req, block_warnings
463
+
464
+ def _is_valid_assertion_label(self, label: str) -> bool:
465
+ """Check if an assertion label matches expected format.
466
+
467
+ Default expectation is uppercase letters A-Z.
468
+ """
469
+ # Check against configured assertion label pattern if available
470
+ assertion_config = getattr(self.pattern_config, 'assertions', None)
471
+ if assertion_config:
472
+ label_style = assertion_config.get('label_style', 'uppercase')
473
+ if label_style == 'uppercase':
474
+ return bool(re.match(r'^[A-Z]$', label))
475
+ elif label_style == 'numeric':
476
+ return bool(re.match(r'^\d+$', label))
477
+ elif label_style == 'alphanumeric':
478
+ return bool(re.match(r'^[A-Z0-9]+$', label))
479
+ # Default: uppercase single letter
480
+ return bool(re.match(r'^[A-Z]$', label))
481
+
482
+ def _parse_implements(self, implements_str: str) -> List[str]:
483
+ """Parse comma-separated implements list.
484
+
485
+ Returns empty list if the value is a "no reference" indicator.
486
+ """
487
+ if not implements_str:
488
+ return []
489
+
490
+ # Check if it's a "no reference" value
491
+ stripped = implements_str.strip()
492
+ if stripped in self.no_reference_values:
493
+ return []
494
+
495
+ parts = [p.strip() for p in implements_str.split(",")]
496
+ # Filter out empty parts and no-reference values
497
+ return [p for p in parts if p and p not in self.no_reference_values]
498
+
499
    def _extract_body(self, text: str) -> str:
        """Extract the main body text from requirement block.

        Body is everything between the header (and optional metadata line)
        and the end marker, including Rationale and Acceptance Criteria sections.
        Trailing blank lines are removed for consistent hashing.
        """
        lines = text.split("\n")
        body_lines = []
        found_header = False  # True once a requirement header line was seen
        in_body = False       # True once body content has started

        for line in lines:
            # Skip header line (this also drops any trailing header line the
            # caller appended when the block ended at the next requirement)
            if self.HEADER_PATTERN.match(line):
                found_header = True
                continue

            if found_header and not in_body:
                # Metadata line - skip it but mark body start
                if "**Level**" in line or "**Status**" in line:
                    in_body = True
                    continue
                # First non-blank content line starts body (when no metadata)
                elif line.strip():
                    in_body = True
                    # Don't continue - include this line in body

            # Stop at end marker
            if line.strip().startswith("*End*"):
                break

            # Blank lines before the metadata/first content line are dropped
            # because in_body is still False at that point.
            if in_body:
                body_lines.append(line)

        # Remove trailing blank lines (matches hht-diary clean_requirement_body)
        while body_lines and not body_lines[-1].strip():
            body_lines.pop()

        # Strip trailing whitespace from result
        return "\n".join(body_lines).rstrip()
540
+
541
+ def _extract_assertions(self, text: str) -> List[Assertion]:
542
+ """Extract assertions from requirement text.
543
+
544
+ Looks for `## Assertions` section and parses lines like:
545
+ A. The system SHALL...
546
+ B. The system SHALL NOT...
547
+
548
+ Args:
549
+ text: The requirement text block
550
+
551
+ Returns:
552
+ List of Assertion objects
553
+ """
554
+ assertions: List[Assertion] = []
555
+
556
+ # Find the assertions section
557
+ header_match = self.ASSERTIONS_HEADER_PATTERN.search(text)
558
+ if not header_match:
559
+ return assertions
560
+
561
+ # Get text after the header until the next section or end marker
562
+ start_pos = header_match.end()
563
+ section_text = text[start_pos:]
564
+
565
+ # Find the end of the assertions section (next ## header, Rationale, or End marker)
566
+ end_patterns = [
567
+ r"^##\s+", # Next section header
568
+ r"^\*End\*", # End marker
569
+ r"^---\s*$", # Separator line
570
+ ]
571
+ end_pos = len(section_text)
572
+ for pattern in end_patterns:
573
+ match = re.search(pattern, section_text, re.MULTILINE)
574
+ if match and match.start() < end_pos:
575
+ end_pos = match.start()
576
+
577
+ assertions_text = section_text[:end_pos]
578
+
579
+ # Parse individual assertion lines
580
+ for match in self.ASSERTION_LINE_PATTERN.finditer(assertions_text):
581
+ label = match.group(1)
582
+ assertion_text = match.group(2).strip()
583
+
584
+ # Check if this is a placeholder
585
+ is_placeholder = any(
586
+ assertion_text.lower().startswith(pv.lower())
587
+ for pv in self.placeholder_values
588
+ )
589
+
590
+ assertions.append(Assertion(
591
+ label=label,
592
+ text=assertion_text,
593
+ is_placeholder=is_placeholder,
594
+ ))
595
+
596
+ return assertions