invar-tools 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,563 @@
+ """
+ Markdown document parser for structured document queries.
+
+ DX-76: Parses markdown into a section tree for precise navigation.
+ Core module - pure logic, no I/O.
+ """
+ # @invar:allow file_size: DX-77 Phase A adds Unicode fuzzy matching, extraction planned
+
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass, field
+
+ from deal import post, pre
+ from invar_runtime import skip_property_test
+ from markdown_it import MarkdownIt
+
+
+ @dataclass
+ class Section:
+     """A document section (heading + content).
+
+     Represents a heading and its content up to the next same-level or higher heading.
+
+     Examples:
+         >>> s = Section(
+         ...     title="Introduction",
+         ...     slug="introduction",
+         ...     level=1,
+         ...     line_start=1,
+         ...     line_end=10,
+         ...     char_count=500,
+         ...     path="introduction",
+         ... )
+         >>> s.title
+         'Introduction'
+         >>> s.level
+         1
+     """
+
+     title: str  # "Authentication"
+     slug: str  # "authentication"
+     level: int  # 1-6
+     line_start: int  # 1-indexed
+     line_end: int  # 1-indexed, inclusive
+     char_count: int  # Content character count
+     path: str  # "requirements/functional/authentication"
+     children: list[Section] = field(default_factory=list)
+
+
+ @dataclass
+ class FrontMatter:
+     """YAML front matter metadata.
+
+     Examples:
+         >>> fm = FrontMatter(line_start=1, line_end=5, content="title: Hello")
+         >>> fm.line_start
+         1
+     """
+
+     line_start: int  # 1-indexed
+     line_end: int  # 1-indexed, inclusive
+     content: str  # Raw YAML content
+
+
+ @dataclass
+ class DocumentToc:
+     """Table of contents for a document.
+
+     Examples:
+         >>> toc = DocumentToc(sections=[], frontmatter=None)
+         >>> toc.sections
+         []
+     """
+
+     sections: list[Section]
+     frontmatter: FrontMatter | None
+
+
+ @pre(lambda title: len(title) <= 1000)  # Reasonable max title length
+ @post(lambda result: result == "" or re.match(r"^[a-z0-9]+(-[a-z0-9]+)*$", result))
+ def _slugify(title: str) -> str:
+     """Convert title to URL-friendly slug.
+
+     Examples:
+         >>> _slugify("Hello World")
+         'hello-world'
+         >>> _slugify("API Reference (v2)")
+         'api-reference-v2'
+         >>> _slugify(" Multiple Spaces ")
+         'multiple-spaces'
+         >>> _slugify("")
+         ''
+     """
+     # Lowercase
+     slug = title.lower()
+     # Replace non-alphanumeric with hyphens
+     slug = re.sub(r"[^a-z0-9]+", "-", slug)
+     # Remove leading/trailing hyphens
+     slug = slug.strip("-")
+     return slug
+
+
+ @skip_property_test("crosshair_incompatible: Unicode character validation conflicts with symbolic execution")  # type: ignore[untyped-decorator]
+ @pre(lambda text: len(text) <= 1000)
+ @post(lambda result: result == '' or all(c.isalnum() or c == '_' or ord(c) > 127 for c in result))
+ def _normalize_for_fuzzy(text: str) -> str:
+     """
+     Normalize text for Unicode-aware fuzzy matching.
+
+     Removes punctuation and whitespace, converts ASCII to lowercase,
+     preserves Unicode characters (Chinese, Japanese, etc.).
+
+     Examples:
+         >>> _normalize_for_fuzzy("Hello World")
+         'helloworld'
+         >>> _normalize_for_fuzzy("Phase B")
+         'phaseb'
+         >>> _normalize_for_fuzzy("验证计划")
+         '验证计划'
+         >>> _normalize_for_fuzzy("Phase B 验证计划")
+         'phaseb验证计划'
+         >>> _normalize_for_fuzzy(" Multiple Spaces ")
+         'multiplespaces'
+         >>> _normalize_for_fuzzy("")
+         ''
+         >>> _normalize_for_fuzzy("API (v2.0)")
+         'apiv20'
+     """
+     # Convert ASCII to lowercase, keep Unicode as-is
+     ascii_lower = ''.join(c.lower() if c.isascii() else c for c in text)
+     # Remove non-word-chars, but keep Unicode letters/digits (via re.UNICODE)
+     return re.sub(r'[^\w]', '', ascii_lower, flags=re.UNICODE)
+
+
+ @pre(lambda sections: all(1 <= s.level <= 6 for s in sections))  # Valid heading levels
+ @post(lambda result: all(1 <= s.level <= 6 for s in result))
+ def _build_section_tree(sections: list[Section]) -> list[Section]:
+     """Build hierarchical tree from flat section list.
+
+     Uses level to determine parent-child relationships.
+     Updates path to include parent slugs.
+
+     Examples:
+         >>> s1 = Section("A", "a", 1, 1, 10, 100, "a", [])
+         >>> s2 = Section("B", "b", 2, 5, 8, 50, "b", [])
+         >>> tree = _build_section_tree([s1, s2])
+         >>> len(tree)
+         1
+         >>> tree[0].children[0].title
+         'B'
+         >>> tree[0].children[0].path
+         'a/b'
+
+         >>> # Empty list
+         >>> _build_section_tree([])
+         []
+     """
+     if not sections:
+         return []
+
+     result: list[Section] = []
+     stack: list[Section] = []
+
+     for section in sections:
+         # Pop sections from stack that are not parents of current
+         while stack and stack[-1].level >= section.level:
+             stack.pop()
+
+         # Update path based on parent
+         if stack:
+             section.path = f"{stack[-1].path}/{section.slug}"
+             stack[-1].children.append(section)
+         else:
+             result.append(section)
+
+         stack.append(section)
+
+     return result
+
+
+ @skip_property_test("external_io: hypothesis inspect module incompatibility with Python 3.14")  # type: ignore[untyped-decorator]
+ @pre(lambda source: len(source) <= 10_000_000)  # Max 10MB document
+ @post(lambda result: all(s.line_start >= 1 for s in result.sections))
+ @post(lambda result: all(s.line_end >= s.line_start for s in result.sections))
+ @post(lambda result: all(1 <= s.level <= 6 for s in result.sections))
+ def parse_toc(source: str) -> DocumentToc:
+     """Parse markdown source into a section tree.
+
+     Extracts headings and builds a hierarchical structure.
+     Line numbers are 1-indexed for user display.
+
+     Examples:
+         >>> toc = parse_toc("# Hello\\n\\nWorld")
+         >>> len(toc.sections)
+         1
+         >>> toc.sections[0].title
+         'Hello'
+         >>> toc.sections[0].slug
+         'hello'
+         >>> toc.sections[0].level
+         1
+
+         >>> # Nested headings
+         >>> toc2 = parse_toc("# A\\n## B\\n## C\\n# D")
+         >>> len(toc2.sections)
+         2
+         >>> toc2.sections[0].title
+         'A'
+         >>> len(toc2.sections[0].children)
+         2
+         >>> toc2.sections[0].children[0].title
+         'B'
+
+         >>> # Empty document
+         >>> toc3 = parse_toc("")
+         >>> len(toc3.sections)
+         0
+
+         >>> # Setext headings
+         >>> toc4 = parse_toc("Title\\n=====\\n\\nSubtitle\\n--------")
+         >>> toc4.sections[0].title
+         'Title'
+         >>> toc4.sections[0].level
+         1
+         >>> toc4.sections[0].children[0].title
+         'Subtitle'
+         >>> toc4.sections[0].children[0].level
+         2
+
+         >>> # Front matter
+         >>> toc5 = parse_toc("---\\ntitle: Test\\n---\\n# Heading")
+         >>> toc5.frontmatter is not None
+         True
+         >>> toc5.frontmatter.content
+         'title: Test'
+     """
+     lines = source.split("\n")
+     total_lines = len(lines)
+
+     # Detect and extract front matter
+     frontmatter = None
+     content_start_line = 0  # 0-indexed
+
+     if source.startswith("---\n") or source.startswith("---\r\n"):
+         # Look for closing ---
+         for i, line in enumerate(lines[1:], start=1):
+             if line.strip() == "---":
+                 fm_content = "\n".join(lines[1:i])
+                 frontmatter = FrontMatter(
+                     line_start=1,
+                     line_end=i + 1,  # 1-indexed, inclusive
+                     content=fm_content,
+                 )
+                 content_start_line = i + 1
+                 break
+
+     # Parse markdown (skip front matter if present)
+     content_to_parse = "\n".join(lines[content_start_line:])
+     md = MarkdownIt()
+     tokens = md.parse(content_to_parse)
+
+     # Extract headings
+     headings: list[tuple[str, int, int, int]] = []  # (title, level, start, end)
+     i = 0
+     while i < len(tokens):
+         token = tokens[i]
+         if token.type == "heading_open":
+             level = int(token.tag[1])  # h1 -> 1, h2 -> 2, etc.
+             token_map = token.map or [0, 1]
+             # Adjust for front matter offset
+             start_line = token_map[0] + content_start_line + 1  # 1-indexed
+             end_line = token_map[1] + content_start_line  # 1-indexed
+
+             # Next token should be inline with content
+             if i + 1 < len(tokens) and tokens[i + 1].type == "inline":
+                 title = tokens[i + 1].content or ""
+                 headings.append((title, level, start_line, end_line))
+                 i += 1
+         i += 1
+
+     # Build section tree
+     if not headings:
+         return DocumentToc(sections=[], frontmatter=frontmatter)
+
+     # Calculate end lines for each section (until next heading or EOF)
+     sections_flat: list[Section] = []
+     for idx, (title, level, start, _) in enumerate(headings):
+         # Line before next heading, or EOF
+         end = headings[idx + 1][2] - 1 if idx + 1 < len(headings) else total_lines
+
+         # Calculate char count
+         section_lines = lines[start - 1 : end]  # Convert to 0-indexed
+         char_count = sum(len(line) for line in section_lines)
+
+         slug = _slugify(title)
+         sections_flat.append(
+             Section(
+                 title=title,
+                 slug=slug,
+                 level=level,
+                 line_start=start,
+                 line_end=end,
+                 char_count=char_count,
+                 path=slug,  # Will be updated during tree building
+                 children=[],
+             )
+         )
+
+     # Build tree from flat list
+     root_sections = _build_section_tree(sections_flat)
+
+     return DocumentToc(sections=root_sections, frontmatter=frontmatter)
+
+
+ @pre(lambda sections, target_line: target_line >= 1)
+ @post(lambda result: result is None or isinstance(result, Section))
+ def _find_by_line(sections: list[Section], target_line: int) -> Section | None:
+     """Find section by line number.
+
+     Examples:
+         >>> s = Section("A", "a", 1, 5, 10, 100, "a", [])
+         >>> _find_by_line([s], 5).title
+         'A'
+         >>> _find_by_line([s], 1) is None
+         True
+         >>> _find_by_line([], 5) is None
+         True
+     """
+     for section in sections:
+         if section.line_start == target_line:
+             return section
+         # Search children
+         found = _find_by_line(section.children, target_line)
+         if found:
+             return found
+     return None
+
+
+ @pre(lambda sections, path: len(path) > 0 and path.startswith("#"))
+ @post(lambda result: result is None or isinstance(result, Section))
+ def _find_by_index(sections: list[Section], path: str) -> Section | None:
+     """Find section by index path (#0/#1/#2).
+
+     Examples:
+         >>> s = Section("A", "a", 1, 1, 10, 100, "a", [
+         ...     Section("B", "b", 2, 3, 8, 50, "a/b", [])
+         ... ])
+         >>> _find_by_index([s], "#0").title
+         'A'
+         >>> _find_by_index([s], "#0/#0").title
+         'B'
+         >>> _find_by_index([s], "#1") is None
+         True
+         >>> _find_by_index([], "#0") is None
+         True
+     """
+     parts = path.split("/")
+     current_list = sections
+
+     for i, part in enumerate(parts):
+         if not part.startswith("#"):
+             return None
+         try:
+             idx = int(part[1:])
+         except ValueError:
+             return None
+
+         if idx < 0 or idx >= len(current_list):
+             return None
+
+         section = current_list[idx]
+         if i == len(parts) - 1:  # Last part
+             return section
+         current_list = section.children
+
+     return None
+
+
+ @skip_property_test("crosshair_incompatible: Calls _normalize_for_fuzzy with Unicode validation")  # type: ignore[untyped-decorator]
+ @pre(lambda sections, path: len(path) > 0)
+ @post(lambda result: result is None or isinstance(result, Section))
+ def _find_by_slug_or_fuzzy(sections: list[Section], path: str) -> Section | None:
+     """Find section by slug path or fuzzy match.
+
+     Examples:
+         >>> s = Section("Intro", "intro", 1, 1, 10, 100, "intro", [
+         ...     Section("Overview", "overview", 2, 3, 8, 50, "intro/overview", [])
+         ... ])
+         >>> _find_by_slug_or_fuzzy([s], "intro/overview").title
+         'Overview'
+         >>> _find_by_slug_or_fuzzy([s], "over").title
+         'Overview'
+         >>> _find_by_slug_or_fuzzy([s], "nonexistent") is None
+         True
+         >>> _find_by_slug_or_fuzzy([], "anything") is None
+         True
+     """
+     path_lower = path.lower()
+
+     # Try exact slug path match first
+     def find_exact(secs: list[Section], remaining_path: str) -> Section | None:
+         if "/" in remaining_path:
+             first, rest = remaining_path.split("/", 1)
+             for sec in secs:
+                 if sec.slug == first:
+                     return find_exact(sec.children, rest)
+             return None
+         else:
+             for sec in secs:
+                 if sec.slug == remaining_path:
+                     return sec
+             return None
+
+     exact = find_exact(sections, path_lower)
+     if exact:
+         return exact
+
+     # Fuzzy match with Unicode-aware normalization
+     def find_fuzzy(secs: list[Section]) -> Section | None:
+         normalized_path = _normalize_for_fuzzy(path)
+         for sec in secs:
+             normalized_slug = _normalize_for_fuzzy(sec.slug)
+             normalized_title = _normalize_for_fuzzy(sec.title)
+             if normalized_path in normalized_slug or normalized_path in normalized_title:
+                 return sec
+             found = find_fuzzy(sec.children)
+             if found:
+                 return found
+         return None
+
+     return find_fuzzy(sections)
+
+
+ @skip_property_test("crosshair_incompatible: Calls _find_by_slug_or_fuzzy with Unicode validation")  # type: ignore[untyped-decorator]
+ @pre(lambda sections, path: len(path) > 0)
+ @post(lambda result: result is None or isinstance(result, Section))
+ def find_section(sections: list[Section], path: str) -> Section | None:
+     """Find section by path (slug, fuzzy, index, or line anchor).
+
+     Path formats:
+     - Slug path: "requirements/functional/auth" (case-insensitive)
+     - Fuzzy: "auth" (matches first containing section)
+     - Index: "#0/#1" (0-indexed positional)
+     - Line anchor: "@48" (section starting at line 48)
+
+     Examples:
+         >>> sections = [
+         ...     Section("Intro", "intro", 1, 1, 10, 100, "intro", [
+         ...         Section("Overview", "overview", 2, 3, 8, 50, "intro/overview", [])
+         ...     ])
+         ... ]
+
+         >>> # Slug path
+         >>> s = find_section(sections, "intro/overview")
+         >>> s.title
+         'Overview'
+
+         >>> # Fuzzy match
+         >>> s2 = find_section(sections, "over")
+         >>> s2.title
+         'Overview'
+
+         >>> # Index path
+         >>> s3 = find_section(sections, "#0/#0")
+         >>> s3.title
+         'Overview'
+
+         >>> # Line anchor
+         >>> s4 = find_section(sections, "@3")
+         >>> s4.title
+         'Overview'
+
+         >>> # Not found
+         >>> find_section(sections, "nonexistent") is None
+         True
+     """
+     # Line anchor: @48
+     if path.startswith("@"):
+         try:
+             target_line = int(path[1:])
+             return _find_by_line(sections, target_line)
+         except ValueError:
+             return None
+
+     # Index path: #0/#1/#2
+     if path.startswith("#"):
+         return _find_by_index(sections, path)
+
+     # Slug path or fuzzy match
+     return _find_by_slug_or_fuzzy(sections, path)  # type: ignore[no-any-return]
+
+
+ @pre(lambda section: section.line_end >= section.line_start)
+ @pre(lambda section: section.line_start >= 1)
+ @post(lambda result: result >= 1)
+ def _get_last_line(section: Section) -> int:
+     """Get the last line number of a section, including all descendants.
+
+     Examples:
+         >>> s = Section("Title", "title", 1, 1, 5, 100, "title", [])
+         >>> _get_last_line(s)
+         5
+         >>> parent = Section("Parent", "parent", 1, 1, 4, 100, "parent", [
+         ...     Section("Child", "child", 2, 5, 8, 50, "parent/child", [])
+         ... ])
+         >>> _get_last_line(parent)
+         8
+     """
+     if not section.children:
+         return section.line_end
+
+     # Recursively find the last line of the last child
+     last_child = section.children[-1]
+     return _get_last_line(last_child)
+
+
+ @pre(lambda source, section, include_children=True: section.line_start >= 1)
+ @pre(lambda source, section, include_children=True: section.line_end >= section.line_start)
+ @pre(lambda source, section, include_children=True: section.line_end <= len(source.split("\n")))  # Bounds check
+ def extract_content(source: str, section: Section, include_children: bool = True) -> str:
+     """Extract section content from source.
+
+     Returns the content from line_start to line_end (1-indexed, inclusive).
+     When include_children=False, stops at first child heading.
+     When include_children=True, includes all descendant sections.
+
+     Examples:
+         >>> source = "# Title\\n\\nParagraph one.\\n\\nParagraph two."
+         >>> section = Section("Title", "title", 1, 1, 5, 50, "title", [])
+         >>> content = extract_content(source, section)
+         >>> "# Title" in content
+         True
+         >>> "Paragraph one" in content
+         True
+
+         >>> # Without children
+         >>> parent = Section("Parent", "parent", 1, 1, 4, 100, "parent", [
+         ...     Section("Child", "child", 2, 3, 4, 50, "parent/child", [])
+         ... ])
+         >>> src = "# Parent\\nIntro\\n## Child\\nBody"
+         >>> extract_content(src, parent, include_children=False)
+         '# Parent\\nIntro'
+
+         >>> # With children
+         >>> extract_content(src, parent, include_children=True)
+         '# Parent\\nIntro\\n## Child\\nBody'
+     """
+     lines = source.split("\n")
+     start_idx = section.line_start - 1
+
+     if include_children:
+         # Include all descendants
+         end_idx = _get_last_line(section)
+     elif not section.children:
+         # No children to exclude
+         end_idx = section.line_end
+     else:
+         # Stop before first child
+         first_child_line = section.children[0].line_start
+         end_idx = first_child_line - 1
+
+     return "\n".join(lines[start_idx:end_idx])
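
Taken together, the new module exposes parse_toc, find_section, and extract_content as its public surface. The sketch below shows how they compose; it is based only on the signatures and doctests in the hunk above, and the import path is an assumption, since this excerpt does not show where the file lives inside the package.

    # Usage sketch; the module path is assumed (not shown in this excerpt).
    from invar_tools.markdown_doc import extract_content, find_section, parse_toc

    source = "# Guide\n\n## Install\n\npip install invar-tools\n\n## Usage\n\nCall parse_toc()."
    toc = parse_toc(source)                      # heading tree plus optional front matter
    usage = find_section(toc.sections, "usage")  # "#0/#1" and "@7" resolve to the same section
    if usage is not None:
        print(usage.path)                        # guide/usage
        print(extract_content(source, usage))    # "## Usage" through the end of the document
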
@@ -28,7 +28,9 @@ class TSSymbol:
 
 
  # Regex patterns for TypeScript constructs
- # Note: These are simplified patterns suitable for common cases
+ # Note: These are simplified patterns suitable for common cases.
+ # Known limitation: Multiline parameter lists are truncated to first line.
+ # Phase 2 can upgrade to tree-sitter for full multiline support.
  _FUNCTION_PATTERN = re.compile(
      r"^\s*(?:@\w+(?:\([^)]*\))?\s*\n\s*)*"  # Optional decorators
      r"(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*"
@@ -79,8 +81,9 @@ _JSDOC_PATTERN = re.compile(
 
 
  # @invar:allow function_size: Regex extraction inherently repetitive per TS construct type
- @pre(lambda source: source is not None)  # Accepts any string including empty
- @post(lambda result: all(s.line > 0 for s in result))  # All symbols have valid line numbers
+ # @invar:allow redundant_type_contract: Defense-in-depth for dynamic callers
+ @pre(lambda source: isinstance(source, str) and len(source) < 10_000_000)  # ~10MB DoS limit
+ @post(lambda result: all(s.line > 0 and s.name for s in result))  # Valid line numbers and names
  def extract_ts_signatures(source: str) -> list[TSSymbol]:
      """Extract TypeScript symbols from source code.
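
The tightened contracts above replace the permissive "source is not None" precondition with an explicit type check and a roughly 10MB cap, so oversized input now fails the @pre contract instead of being scanned. A minimal caller-side sketch of that implication follows; it assumes extract_ts_signatures has been imported from the package's TypeScript extraction module, whose path is not shown in this excerpt, and the extract_guarded helper is purely illustrative.

    # Sketch only: mirrors the new @pre contract; the import of
    # extract_ts_signatures is assumed and not shown in this excerpt.
    MAX_TS_SOURCE = 10_000_000  # cap taken from the new precondition

    def extract_guarded(source: str) -> list:
        if not isinstance(source, str) or len(source) >= MAX_TS_SOURCE:
            return []  # would violate the new precondition, so skip extraction
        return extract_ts_signatures(source)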