markitecture 0.1.15__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. markitecture/__init__.py +41 -0
  2. markitecture/__main__.py +4 -0
  3. markitecture/cli/__init__.py +3 -0
  4. markitecture/cli/app.py +38 -0
  5. markitecture/cli/commands/__init__.py +21 -0
  6. markitecture/cli/commands/config.py +84 -0
  7. markitecture/cli/commands/links.py +146 -0
  8. markitecture/cli/commands/metrics.py +193 -0
  9. markitecture/cli/commands/mkdocs.py +39 -0
  10. markitecture/cli/commands/split.py +48 -0
  11. markitecture/errors.py +64 -0
  12. markitecture/generators/__init__.py +3 -0
  13. markitecture/generators/configs/__init__.py +0 -0
  14. markitecture/generators/configs/mintlify_json.py +0 -0
  15. markitecture/generators/configs/mkdocs_yaml.py +317 -0
  16. markitecture/metrics/__init__.py +9 -0
  17. markitecture/metrics/analyzer.py +109 -0
  18. markitecture/metrics/badges/__init__.py +28 -0
  19. markitecture/metrics/badges/base.py +7 -0
  20. markitecture/metrics/badges/compact.py +35 -0
  21. markitecture/metrics/badges/detailed.py +60 -0
  22. markitecture/metrics/badges/minimal.py +19 -0
  23. markitecture/metrics/badges/modern.py +45 -0
  24. markitecture/metrics/badges/retro.py +23 -0
  25. markitecture/metrics/badges/shields.py +124 -0
  26. markitecture/metrics/svg_generator.py +70 -0
  27. markitecture/processing/__init__.py +0 -0
  28. markitecture/processing/link_validator.py +133 -0
  29. markitecture/processing/reflink_converter.py +198 -0
  30. markitecture/processing/reflink_extractor.py +82 -0
  31. markitecture/processing/text_splitter.py +290 -0
  32. markitecture/settings/__init__.py +9 -0
  33. markitecture/settings/config.py +61 -0
  34. markitecture/settings/validators.py +26 -0
  35. markitecture/utils/__init__.py +5 -0
  36. markitecture/utils/file_handler.py +24 -0
  37. markitecture/utils/printer.py +195 -0
  38. markitecture/utils/sanitizer.py +78 -0
  39. markitecture-0.1.15.dist-info/METADATA +271 -0
  40. markitecture-0.1.15.dist-info/RECORD +43 -0
  41. markitecture-0.1.15.dist-info/WHEEL +4 -0
  42. markitecture-0.1.15.dist-info/entry_points.txt +2 -0
  43. markitecture-0.1.15.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,290 @@
1
+ """Text splitting methods for parsing markdown content into sections."""
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Dict, List
7
+
8
+ from markitecture.processing.reflink_extractor import ReferenceLinkExtractor
9
+ from markitecture.utils.printer import RichPrinter
10
+ from markitecture.utils.sanitizer import sanitize_filename
11
+
12
# Shared RichPrinter instance used for this module's console messages.
_printer = RichPrinter()
13
+
14
+
15
+ @dataclass
16
+ class Section:
17
+ """
18
+ Represents a split markdown section.
19
+ """
20
+
21
+ title: str
22
+ content: str
23
+ level: int
24
+ filename: Path
25
+ parent_context: str | None = None
26
+ references: dict[str, str] | None = None
27
+
28
+ def __post_init__(self) -> None:
29
+ """Initialize references as an empty dictionary if not provided."""
30
+ if self.references is None:
31
+ self.references = {}
32
+
33
+
34
class MarkdownTextSplitter:
    """
    Split markdown content into sections based on specified heading level.

    The target level is taken from ``settings.model_dump()["split"]
    ["heading_level"]`` (a run of ``#`` characters). Headings that sit inside
    detected code blocks are ignored when looking for split points.
    """

    # Fenced code block opened by ``` or ~~~ (an info string may follow).
    _FENCED_BLOCK_RE = re.compile(r"(?:```|~~~)[^\n]*\n.*?(?:```|~~~)", re.DOTALL)
    # A line indented by four whitespace characters or one tab.
    _INDENTED_LINE_RE = re.compile(r"^(?:\s{4}|\t).*$")
    # ATX heading with optional trailing HTML comments. The separator after
    # the ``#`` run is limited to spaces/tabs so that an empty heading line
    # ("#" followed by a newline) cannot swallow the next line as its title.
    _HEADING_RE = re.compile(
        r"^(#{1,6})[ \t]+(.+?)(?:\s+<!--.*?-->)*\s*$", re.MULTILINE
    )
    # Thematic break (---, ***, ___) at the very end of a section body.
    _THEMATIC_BREAK_RE = re.compile(r"\n[*_-]{3,}\s*$")

    def __init__(self, settings: object = None) -> None:
        """
        Store the settings object and compile the settings-dependent patterns.

        Args:
            settings: Settings object exposing ``model_dump()`` and ``split``.
                A default ``MarkitectureApp()`` is created when it is falsy.
        """
        # Imported here rather than at module level (kept from the original;
        # presumably to avoid a circular import - confirm before moving).
        from markitecture.cli.app import MarkitectureApp

        self.settings = settings or MarkitectureApp()
        self._compile_patterns()
        _printer.print_debug(
            f"MarkdownSplitter initialized with settings: {self.settings}"
        )

    def process_file(self, content: str) -> List[Section]:
        """
        Split ``content`` and write every section into the output directory.

        The directory is ``settings.split.output_dir`` and is created when
        missing. When the settings object offers ``process_mkdocs`` it is
        called with the resulting sections.

        Args:
            content: Full markdown text to split.

        Returns:
            The sections that were written.
        """
        _printer.print_info("Processing markdown content...")
        sections = self.split(content)
        output_dir = Path(self.settings.split.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        _printer.print_debug(f"Verified output directory: {output_dir}")

        written: set[Path] = set()
        for section in sections:
            section_path = output_dir / section.filename
            if section_path in written:
                # Identical titles sanitize to the same file name; the later
                # section replaces the earlier one on disk, so say so.
                _printer.print_warning(
                    f"Duplicate section filename '{section.filename}': "
                    f"overwriting earlier content with section '{section.title}'"
                )
            written.add(section_path)
            _printer.print_debug(f"Writing section '{section.title}' to {section_path}")
            section_path.write_text(section.content, encoding="utf-8")

        if hasattr(self.settings, "process_mkdocs"):
            # The mkdocs settings may live on a nested ``settings`` attribute
            # or directly on the settings object; never fail just for logging.
            holder = getattr(self.settings, "settings", self.settings)
            mkdocs_settings = getattr(holder, "mkdocs", None)
            _printer.print_info(
                f"Processing mkdocs.yml with settings: {mkdocs_settings}"
            )
            self.settings.process_mkdocs(sections)

        _printer.print_info("File processing completed successfully")
        return sections

    def split(self, content: str) -> List[Section]:
        """
        Split markdown content into sections based on specified heading level.

        A section starts at a heading of the target level and runs until the
        next heading of the same or a higher (fewer ``#``) level. Deeper
        headings stay inside the current section. Text that belongs to a
        higher-level heading, or that precedes the first target-level heading,
        is not emitted. When the document has no headings at all, a single
        ``README`` section (level 0) holding the untouched content is returned.

        Args:
            content: Full markdown text to split.

        Returns:
            The sections in document order.
        """
        _printer.print_info("Executing text splitting...")

        ref_handler = ReferenceLinkExtractor(content)

        _printer.print_debug(
            f"Extracted {len(ref_handler.references)} references from content"
        )

        headings = self._find_headings(content)
        if not headings:
            _printer.print_info("No headings found. Creating single README section.")
            section = self._create_section(
                title="README",
                content=content,
                level=0,
                references=ref_handler.references,
            )
            return [section]

        # Target heading level is determined by number of # in settings
        target_level = len(self.settings.model_dump()["split"]["heading_level"])
        sections: List[Section] = []

        # Start offset and title of the section currently being collected.
        open_start: int | None = None
        open_title: str | None = None

        for match in headings:
            heading_level = len(match.group(1))  # Number of # symbols
            if heading_level > target_level:
                # Nested heading: part of the open section, or stray content
                # before the first target-level heading. Nothing to do.
                continue

            # A same- or higher-level heading closes the open section.
            if open_start is not None:
                sections.append(
                    self._build_section(
                        open_title,
                        content[open_start : match.start()],
                        target_level,
                        ref_handler,
                    )
                )

            if heading_level == target_level:
                open_start = match.start()
                open_title = match.group(2).strip()
            else:
                # Higher-level heading: its own content is skipped.
                open_start = None
                open_title = None

        # Handle the last section if we were building one
        if open_start is not None:
            sections.append(
                self._build_section(
                    open_title, content[open_start:], target_level, ref_handler
                )
            )

        _printer.print_info(
            f"Successfully split document into {len(sections)} sections."
        )
        return sections

    def _find_code_spans(self, content: str) -> List[tuple]:
        """Return ``(start, end)`` offsets of fenced and indented code blocks."""
        spans = [
            (found.start(), found.end())
            for found in self._FENCED_BLOCK_RE.finditer(content)
        ]

        lines = content.split("\n")
        # line_starts[k] is the offset of the first character of line k; the
        # extra last entry is len(content) + 1.
        line_starts = [0]
        for line in lines:
            line_starts.append(line_starts[-1] + len(line) + 1)

        def prefix_length(index: int) -> int:
            # Same value as len("\n".join(lines[:index])), without re-joining.
            return max(line_starts[index] - 1, 0)

        is_indented = self._INDENTED_LINE_RE.match
        i = 0
        while i < len(lines):
            if is_indented(lines[i]):
                start = prefix_length(i)
                # The block continues over indented and blank lines.
                while i < len(lines) and (
                    is_indented(lines[i]) or not lines[i].strip()
                ):
                    i += 1
                spans.append((start, prefix_length(i)))
            i += 1
        return spans

    def _find_headings(self, content: str) -> list:
        """Return the heading matches (levels 1-6) that lie outside code blocks."""
        code_spans = self._find_code_spans(content)
        return [
            match
            for match in self._HEADING_RE.finditer(content)
            if not any(start <= match.start() <= end for start, end in code_spans)
        ]

    def _build_section(
        self,
        title: str,
        raw_content: str,
        level: int,
        ref_handler: "ReferenceLinkExtractor",
    ) -> Section:
        """Strip a raw slice, attach the references it uses and wrap it in a Section."""
        section_content = raw_content.strip()
        section_refs = ref_handler.find_used_references(section_content)
        return self._create_section(
            title=title,
            content=self._format_section_content(section_content, section_refs),
            level=level,
            references=section_refs,
        )

    def _compile_patterns(self) -> None:
        """Compile regex patterns based on settings."""
        split_settings = self.settings.model_dump()["split"]
        flags = 0 if split_settings["case_sensitive"] else re.IGNORECASE
        self.heading_pattern = re.compile(
            f"^({re.escape(split_settings['heading_level'])})\\s+(.+?)(?:\\s+<!--.*?-->)*\\s*$",
            re.MULTILINE | flags,
        )
        self.reference_pattern = re.compile(r"^\[([^\]]+)\]:\s+(.+)$", re.MULTILINE)
        self.reference_usage = re.compile(r"\[([^\]]+)\](?!\()", re.MULTILINE)

    def _create_section(
        self, title: str, content: str, level: int, references: Dict[str, str]
    ) -> Section:
        """Create a new Section object whose filename is derived from the title."""
        _printer.print_debug(f"Creating section with title: {title}, level: {level}")
        return Section(
            title=title,
            content=content,
            level=level,
            filename=sanitize_filename(text=title),
            references=references,
        )

    def _format_section_content(self, content: str, references: Dict[str, str]) -> str:
        """
        Format section content with references and ensure proper spacing.

        Args:
            content: The main content of the section
            references: Dictionary of reference names to their URLs that are
                actually used in this section

        Returns:
            Formatted content with thematic break, references, and proper
            spacing; an empty string when ``content`` is empty.
        """
        if not content:
            return ""

        # Prepare the base content by trimming trailing whitespace
        base_content = content.rstrip()

        # Add thematic break if one doesn't exist
        if not self._THEMATIC_BREAK_RE.search(base_content):
            base_content += "\n\n---"

        # Only add references if there are any used in this section
        if references:
            base_content += "\n\n<!-- REFERENCE LINKS -->\n" + "".join(
                f"[{ref_name}]: {ref_url}\n"
                for ref_name, ref_url in sorted(references.items())
            )

        # Ensure the file ends with exactly one newline
        return base_content.rstrip() + "\n"
@@ -0,0 +1,9 @@
1
+ from .config import MarkitectureApp
2
+ from .validators import ExistingFilePath, convert_to_path, validate_path
3
+
4
+ __all__ = [
5
+ "ExistingFilePath",
6
+ "MarkitectureApp",
7
+ "convert_to_path",
8
+ "validate_path",
9
+ ]
@@ -0,0 +1,61 @@
1
+ """CLI settings implemented using Pydantic Settings Management."""
2
+
3
+ from pydantic import AliasChoices, Field
4
+ from pydantic_settings import BaseSettings, SettingsConfigDict
5
+
6
+ from markitecture.cli.commands.config import ConfigCommand
7
+ from markitecture.cli.commands.links import CheckLinksCommand, ReferenceLinksCommand
8
+ from markitecture.cli.commands.metrics import MetricsCommand
9
+ from markitecture.cli.commands.mkdocs import MkDocsCommand
10
+ from markitecture.cli.commands.split import SplitCommand
11
+
12
+
13
class MarkitectureApp(BaseSettings):
    """
    Main CLI interface for markitecture.

    Every sub-command is an optional field that defaults to ``None``, and each
    field accepts a short and a long alias. ``version`` is a boolean flag that
    defaults to ``False``.
    """

    # Settings behaviour: case-insensitive names, CLI argument parsing switched
    # on, "MARKITECTURE_" prefix for environment variables, extra fields allowed.
    model_config = SettingsConfigDict(
        case_sensitive=False,
        cli_enforce_required=False,
        cli_implicit_flags=True,
        cli_parse_args=True,
        env_prefix="MARKITECTURE_",
        extra="allow",
    )

    config: ConfigCommand | None = Field(
        validation_alias=AliasChoices("c", "config"),
        description="Manage configuration settings",
        default=None,
    )
    check_links: CheckLinksCommand | None = Field(
        validation_alias=AliasChoices("cl", "check-links"),
        description="Validate links in a markdown file",
        default=None,
    )
    reference_links: ReferenceLinksCommand | None = Field(
        validation_alias=AliasChoices("rl", "reflinks"),
        description="Convert links to reference style",
        default=None,
    )
    split: SplitCommand | None = Field(
        validation_alias=AliasChoices("s", "split"),
        description="Split a markdown file into sections",
        default=None,
    )
    metrics: MetricsCommand | None = Field(
        validation_alias=AliasChoices("m", "metrics"),
        description="Generate document readability metrics",
        default=None,
    )
    mkdocs: MkDocsCommand | None = Field(
        validation_alias=AliasChoices("mk", "mkdocs"),
        description="Generate MkDocs configuration from a Markdown file",
        default=None,
    )
    version: bool = Field(
        validation_alias=AliasChoices("v", "version"),
        description="Display the version number",
        default=False,
    )
@@ -0,0 +1,26 @@
1
+ """Pydantic functions and type annotations to validate user input."""
2
+
3
+ from pathlib import Path
4
+ from typing import Annotated
5
+
6
+ from pydantic import AfterValidator
7
+
8
+ from markitecture.errors import InvalidPathError
9
+
10
+
11
def convert_to_path(v: str) -> Path:
    """Wrap the given path string in a ``Path`` and return it."""
    as_path = Path(v)
    return as_path
14
+
15
+
16
def validate_path(v: Path) -> Path:
    """
    Return ``v`` unchanged when it points at an existing regular file.

    Raises:
        InvalidPathError: If the path is missing or is not a file.
    """
    if v.exists() and v.is_file():
        return v

    raise InvalidPathError(
        message="The provided path does not exist or is not a file.",
        path=str(v),
    )
24
+
25
+
26
# Annotated ``Path`` type that attaches ``validate_path`` as a pydantic AfterValidator.
ExistingFilePath = Annotated[Path, AfterValidator(validate_path)]
@@ -0,0 +1,5 @@
1
+ from .file_handler import FileHandler
2
+ from .printer import RichPrinter
3
+ from .sanitizer import sanitize_filename
4
+
5
+ __all__ = ["FileHandler", "RichPrinter", "sanitize_filename"]
@@ -0,0 +1,24 @@
1
+ """File handling utilities with error handling."""
2
+
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ from markitecture.errors import FileOperationError
7
+
8
+
9
class FileHandler:
    """Read and write UTF-8 text files, reporting failures as FileOperationError."""

    def write(self, file_path: Union[str, Path], content: str) -> None:
        """
        Store ``content`` in ``file_path`` using UTF-8.

        Raises:
            FileOperationError: If anything goes wrong while writing; the
                original exception is chained as the cause.
        """
        try:
            destination = Path(file_path)
            destination.write_text(content, encoding="utf-8")
        except Exception as exc:
            raise FileOperationError(f"Failed to write to {file_path}: {exc}") from exc

    def read(self, file_path: Union[str, Path]) -> str:
        """
        Return the UTF-8 decoded text stored in ``file_path``.

        Raises:
            FileOperationError: If anything goes wrong while reading; the
                original exception is chained as the cause.
        """
        try:
            text = Path(file_path).read_text(encoding="utf-8")
        except Exception as exc:
            raise FileOperationError(f"Failed to read {file_path}: {exc}") from exc
        return text
@@ -0,0 +1,195 @@
1
+ """Enhanced terminal output formatting with integrated table titles."""
2
+
3
+ from typing import List, Optional
4
+
5
+ from rich.box import ROUNDED, SIMPLE
6
+ from rich.console import Console
7
+ from rich.table import Table
8
+ from rich.theme import Theme
9
+
10
+
11
class RichPrinter:
    """
    Utility class for Rich-based printing with integrated table titles and clickable links.

    Text helpers wrap the message in a Rich markup tag and print it on the
    shared console; table helpers print a title row above a rounded content
    table, surrounded by blank lines.
    """

    def __init__(self) -> None:
        """Initialize the RichPrinter with a custom theme and console."""
        self.theme = Theme({
            "info": "cyan",
            "success": "bold green",
            "error": "bold red",
            "warning": "yellow",
            "header": "bold blue",
            "title": "bold magenta",
            "key": "bold white",
            "value": "dim",
            "table_title": "bold white on blue",
        })
        self.console = Console(theme=self.theme)

    # -------------------------------------------------------------------------
    # Basic text-level messages
    # -------------------------------------------------------------------------
    def print_debug(self, message: str) -> None:
        """Print a debug message."""
        self.console.print(f"[dim]{message}[/dim]")

    def print_info(self, message: str) -> None:
        """Print an informational message."""
        self.console.print(f"[info]{message}[/info]")

    def print_success(self, message: str) -> None:
        """Print a success message."""
        self.console.print(f"[success]{message}[/success]")

    def print_error(self, message: str) -> None:
        """Print an error message."""
        self.console.print(f"[error]{message}[/error]")

    def print_warning(self, message: str) -> None:
        """Print a warning message."""
        self.console.print(f"[warning]{message}[/warning]")

    def print_title(self, title: str) -> None:
        """Print a styled title."""
        self.console.print(f"[title]{title}[/title]")

    def print_version(self, version: str) -> None:
        """Print the top-level package name followed by the version number."""
        package_name = __package__.split(".")[0]
        self.console.print(f"[bold green]{package_name}[/bold green] {version}")

    # -------------------------------------------------------------------------
    # Table printing methods
    # -------------------------------------------------------------------------
    def _print_titled_table(self, title: str, content_table: "Table") -> None:
        """Print ``content_table`` under a title row, framed by blank lines."""
        # Borderless outer container stacking the title above the content.
        main_table = Table(box=None, show_header=False, show_edge=False, padding=0)
        main_table.add_column("content", ratio=1)

        title_table = Table(box=SIMPLE, show_header=False, padding=(0, 1))
        title_table.add_column("title", style="table_title", ratio=1)
        title_table.add_row(title)

        main_table.add_row(title_table)
        main_table.add_row(content_table)

        self.console.print()
        self.console.print(main_table)
        self.console.print()

    @staticmethod
    def _format_link_cell(column: str, value: object) -> str:
        """Return the cell text; http(s) values of the ``url`` column become Rich links."""
        if column == "url" and isinstance(value, str) and value.startswith("http"):
            # Make it clickable in the terminal
            return f"[link={value}]{value}[/link]"
        return str(value)

    def print_key_value_table(self, title: str, data: dict[str, str]) -> None:
        """
        Print a table with integrated title and key-value pairs.

        Args:
            title: The title of the table
            data: A dictionary of key-value pairs to display
        """
        content_table = Table(box=ROUNDED, show_header=False, padding=(0, 1))
        content_table.add_column("Key", style="key", no_wrap=True)
        content_table.add_column("Value", style="value")

        for key, val in data.items():
            content_table.add_row(key, val)

        self._print_titled_table(title, content_table)

    def print_table(
        self, title: str, headers: List[str], rows: List[List[str]]
    ) -> None:
        """
        Print a custom table with integrated title.

        Args:
            title: The title of the table
            headers: List of column headers
            rows: List of row data, each row being a list of strings
        """
        content_table = Table(
            box=ROUNDED, show_header=True, header_style="bold blue", padding=(0, 1)
        )

        for header in headers:
            content_table.add_column(header, style="key")

        for row in rows:
            content_table.add_row(*row)

        self._print_titled_table(title, content_table)

    def print_link_table(
        self, title: str, link_rows: List[dict], columns: Optional[List[str]] = None
    ) -> None:
        """
        Print a table specifically for link data, allowing clickable URLs.

        Each element in link_rows is a dict; keys that are missing for a
        requested column are shown as an empty string.

        Args:
            title: The table title
            link_rows: A list of dicts representing link info.
            columns: Optional list of columns to display in table order.
                If None, uses ["line", "status", "url", "error"] by default.
        """
        if columns is None:
            columns = ["line", "status", "url", "error"]

        content_table = Table(
            box=ROUNDED,
            show_header=True,
            header_style="bold blue",
            padding=(0, 1),
            collapse_padding=True,
        )

        for col in columns:
            content_table.add_column(col.capitalize(), style="key")

        for row_data in link_rows:
            content_table.add_row(
                *(self._format_link_cell(col, row_data.get(col, "")) for col in columns)
            )

        self._print_titled_table(title, content_table)
@@ -0,0 +1,78 @@
1
+ """Module for sanitizing markdown headers into safe filenames."""
2
+
3
+ import html
4
+ import re
5
+ from pathlib import Path
6
+
7
+
8
def sanitize_filename(text: str, extension: str = ".md") -> Path:
    """
    Convert a markdown header into a safe filename.

    HTML entities are decoded, heading markers, link/image syntax, inline
    HTML, ``{...}`` attribute blocks and emphasis characters are removed, and
    the remainder is lowercased and reduced to word characters separated by
    single hyphens. Reference-style and inline links/images keep only their
    text (or alt text), so URLs never leak into the filename.

    Args:
        text: The header text to sanitize
        extension: File extension to append (defaults to .md)

    Returns:
        Path object with sanitized filename; ``unnamed-section`` is used as
        the stem when nothing usable is left.
    """
    # Decode HTML entities
    text = html.unescape(text)

    # Applied in order. Images are handled before links so that a linked
    # image such as [![alt](img)](url) collapses to its alt text.
    substitutions = (
        (r"^#+\s*", ""),  # Markdown heading markers
        (r"!\[([^\]]*)\]\[[^\]]*\]", r"\1"),  # Reference-style images
        (r"\[([^\]]*)\]\[[^\]]*\]", r"\1"),  # Reference-style links
        (r"!\[([^\]]*)\]\([^)]*\)", r"\1"),  # Inline images
        (r"\[([^\]]*)\]\([^)]*\)", r"\1"),  # Inline links
        (r"<[^>]+>", ""),  # HTML tags and attributes (inline HTML)
        (r"\{[^}]*\}", ""),  # Attributes in curly braces, e.g. {#custom-id}
        (r"[*_`~]", ""),  # Remaining markdown syntax
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Convert to lowercase and replace spaces/special chars with hyphens
    text = text.strip().lower()
    text = re.sub(r"[^\w\s-]", "", text)  # Remove special characters
    text = re.sub(r"[-\s]+", "-", text)  # Replace spaces and repeated hyphens

    # Remove leading/trailing hyphens; fall back when nothing is left
    text = text.strip("-") or "unnamed-section"

    return Path(f"{text}{extension}")
54
+
55
+
56
def extract_image_alt_text(text: str) -> str:
    """Return the alt text of the first markdown image marker in ``text``.

    Args:
        text: Text that may contain ``![alt]`` image markers

    Returns:
        The alt text of the first marker, or an empty string when there is none
    """
    found = re.search(r"!\[([^\]]*)\]", text)
    if found is None:
        return ""
    return found.group(1)
67
+
68
+
69
def strip_markdown_header(text: str) -> str:
    """Drop leading ``#`` markers (and the whitespace after them) from text.

    Args:
        text: Header text that may start with markdown heading markers

    Returns:
        The text without the leading markers; all other formatting is kept
    """
    leading = re.match(r"^#+\s*", text)
    if leading is None:
        return text
    return text[leading.end():]