aimd-cli 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. aimd/AGENTS.md +68 -0
  2. aimd/__init__.py +13 -0
  3. aimd/adapters/AGENTS.md +23 -0
  4. aimd/adapters/__init__.py +1 -0
  5. aimd/adapters/cli/__init__.py +1 -0
  6. aimd/adapters/cli/app.py +216 -0
  7. aimd/application/AGENTS.md +31 -0
  8. aimd/application/__init__.py +14 -0
  9. aimd/application/bootstrap.py +51 -0
  10. aimd/application/models.py +43 -0
  11. aimd/application/services/__init__.py +1 -0
  12. aimd/application/services/interface_payloads.py +81 -0
  13. aimd/application/services/output_writer.py +68 -0
  14. aimd/application/use_cases/__init__.py +1 -0
  15. aimd/application/use_cases/input_routing.py +51 -0
  16. aimd/application/use_cases/list_engines.py +34 -0
  17. aimd/application/use_cases/process_input.py +40 -0
  18. aimd/application/use_cases/processors/__init__.py +13 -0
  19. aimd/application/use_cases/processors/_base.py +17 -0
  20. aimd/application/use_cases/processors/convert.py +35 -0
  21. aimd/application/use_cases/processors/transcript.py +92 -0
  22. aimd/cli.py +9 -0
  23. aimd/const.py +31 -0
  24. aimd/errors.py +41 -0
  25. aimd/infrastructure/AGENTS.md +26 -0
  26. aimd/infrastructure/__init__.py +1 -0
  27. aimd/infrastructure/documents/__init__.py +19 -0
  28. aimd/infrastructure/documents/chunking.py +168 -0
  29. aimd/infrastructure/documents/title_extractor.py +90 -0
  30. aimd/infrastructure/markitdown_processor.py +103 -0
  31. aimd/infrastructure/media_processor.py +51 -0
  32. aimd/platform_utils.py +26 -0
  33. aimd/py.typed +0 -0
  34. aimd/types.py +12 -0
  35. aimd/utils.py +70 -0
  36. aimd_cli-0.9.2.dist-info/METADATA +23 -0
  37. aimd_cli-0.9.2.dist-info/RECORD +39 -0
  38. aimd_cli-0.9.2.dist-info/WHEEL +4 -0
  39. aimd_cli-0.9.2.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,34 @@
1
+ """Use-case for engine capability introspection."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Callable
5
+
6
+ from aimd_media import EngineCapability
7
+ from aimd_media.errors import EngineUnavailableError
8
+
9
+
10
@dataclass(slots=True)
class ListEnginesResult:
    """Value object returned by ``ListEnginesUseCase.execute``."""

    # Engine resolved for the "auto" preference, or None when resolution
    # reported the engine as unavailable.
    auto_selected_engine: str | None
    # Capability records keyed by string (presumably the engine name —
    # confirm against the capability provider).
    engines: dict[str, EngineCapability]
14
+
15
+
16
@dataclass(slots=True)
class ListEnginesUseCase:
    """Report the known transcription engines and the "auto" selection."""

    # Zero-argument provider of the capability mapping.
    get_capabilities: Callable[[], dict[str, EngineCapability]]
    # Maps a requested engine name (here always "auto") to a concrete one.
    resolve_engine: Callable[[str], str]

    def execute(self) -> ListEnginesResult:
        """Collect capabilities and resolve the automatically selected engine.

        Returns:
            A ``ListEnginesResult`` whose ``auto_selected_engine`` is ``None``
            when ``resolve_engine("auto")`` raises ``EngineUnavailableError``.
        """
        # Capabilities are fetched first, exactly as callers have come to expect.
        engines = self.get_capabilities()

        selected: str | None
        try:
            selected = self.resolve_engine("auto")
        except EngineUnavailableError:
            # No usable engine is not an error for a listing request.
            selected = None

        return ListEnginesResult(auto_selected_engine=selected, engines=engines)
@@ -0,0 +1,40 @@
1
+ """Use-case for input processing orchestration."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Mapping
5
+
6
+ from ...errors import InputNotFoundError, ProcessingFailedError, UnsupportedInputError
7
+ from ..models import InputRoute, ProcessInput, ProcessResult, TaskType
8
+ from .input_routing import FileSupportChecker, ensure_supported_input
9
+ from .processors import TaskProcessor
10
+
11
+
12
@dataclass(slots=True)
class ProcessInputUseCase:
    """Dispatch a processing request to the processor registered for its task."""

    # Processor lookup table keyed by task type.
    processors: Mapping[TaskType, TaskProcessor]
    # Checker handed through to the routing helper.
    is_supported_file: FileSupportChecker

    def ensure_supported_input(self, input_source: str) -> InputRoute:
        """Return the route for ``input_source`` as computed by the routing helper."""
        # Inside the method body this name resolves to the module-level helper.
        return ensure_supported_input(input_source, self.is_supported_file)

    async def execute(self, request: ProcessInput) -> ProcessResult:
        """Route ``request`` and run the matching processor.

        Raises:
            UnsupportedInputError: the route has no task type, or no processor
                is registered for it.
            InputNotFoundError, ProcessingFailedError: re-raised unchanged when
                the processor raises them.
            ProcessingFailedError: wraps any other exception from the processor.
        """
        route = self.ensure_supported_input(request.input_source)

        if (task_type := route.task_type) is None:
            raise UnsupportedInputError("Unsupported input source.")

        if (processor := self.processors.get(task_type)) is None:
            raise UnsupportedInputError(
                f"No processor configured for task: {task_type}"
            )

        passthrough = (InputNotFoundError, UnsupportedInputError, ProcessingFailedError)
        try:
            result = await processor.process(request, route)
        except Exception as exc:
            if isinstance(exc, passthrough):
                raise
            # NOTE(review): any other exception type, including other domain
            # errors, is wrapped here — confirm that is intended.
            raise ProcessingFailedError(str(exc)) from exc
        return result
@@ -0,0 +1,13 @@
1
+ """Task processors used by the process input facade."""
2
+
3
+ from ._base import TaskProcessor
4
+ from .convert import ConvertProcessor, ConvertTaskProcessor
5
+ from .transcript import TranscriptProcessor, TranscriptTaskProcessor
6
+
7
+ __all__ = [
8
+ "ConvertProcessor",
9
+ "ConvertTaskProcessor",
10
+ "TaskProcessor",
11
+ "TranscriptProcessor",
12
+ "TranscriptTaskProcessor",
13
+ ]
@@ -0,0 +1,17 @@
1
+ """Base task processor contract."""
2
+
3
+ from typing import Protocol
4
+
5
+ from ...models import InputRoute, ProcessInput, ProcessResult
6
+
7
+
8
class TaskProcessor(Protocol):
    """Structural interface implemented by every task processor."""

    async def process(
        self, request: ProcessInput, route: InputRoute
    ) -> ProcessResult:
        """Handle ``request`` for the given ``route`` and return its result."""
        ...
@@ -0,0 +1,35 @@
1
+ """Document conversion task processor."""
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Awaitable, Callable
6
+
7
+ from ....types import TextContext
8
+ from ...models import InputRoute, ProcessInput, ProcessResult
9
+
10
# Signature of the injected file converter: five positional arguments,
# awaiting a (text context, optional output directory) pair.
ConvertProcessor = Callable[
    [str, str, str | None, str | None, Path | None],
    Awaitable[tuple[TextContext, Path | None]],
]


@dataclass(slots=True)
class ConvertTaskProcessor:
    """Task processor that converts a local document via ``process_file``."""

    # Injected converter coroutine.
    process_file: ConvertProcessor

    async def process(
        self,
        request: ProcessInput,
        route: InputRoute,  # noqa: ARG002
    ) -> ProcessResult:
        """Convert ``request.input_source`` and wrap the outcome.

        ``route`` is accepted for protocol compatibility and not used.
        """
        source_path = Path(request.input_source).as_posix()
        # Second to fourth arguments are fixed to "auto", None, None here.
        converted = await self.process_file(
            source_path, "auto", None, None, request.temp_dir
        )
        text_context, output_dir = converted
        return ProcessResult(
            task_type="convert",
            text_context=text_context,
            output_dir=output_dir,
        )
@@ -0,0 +1,92 @@
1
+ """Transcript task processor."""
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Awaitable, Callable
6
+
7
+ from ....errors import InputNotFoundError
8
+ from ....types import TextContext
9
+ from ....utils import is_url
10
+ from ...models import InputRoute, ProcessInput, ProcessResult
11
+
12
# NOTE(review): this alias differs from the ``process_url`` field below in its
# sixth argument (``Path | None`` vs ``str | None``) and in the second element
# of its result (``str | None`` vs ``str``) — confirm which one is intended.
TranscriptProcessor = Callable[
    [
        str,
        str,
        str | None,
        str | None,
        Path | None,
        Path | None,
        str | None,
        Path | None,
        bool,
    ],
    Awaitable[tuple[TextContext, str | None]],
]


@dataclass(slots=True)
class TranscriptTaskProcessor:
    """Task processor producing transcripts from URLs or local files."""

    # URL handler. Called with, in order: input source, engine, language,
    # model, save_original, cookies (as str), cookies_from_browser, temp_dir,
    # raw_transcript.
    process_url: Callable[
        [
            str,
            str,
            str | None,
            str | None,
            Path | None,
            str | None,
            str | None,
            Path | None,
            bool,
        ],
        Awaitable[tuple[TextContext, str]],
    ]
    # Local-file handler. Called with: posix path, resolved engine, language,
    # model, temp_dir.
    process_file: Callable[
        [str, str, str | None, str | None, Path | None],
        Awaitable[tuple[TextContext, Path | None]],
    ]
    # Maps a requested engine name to a concrete one; may raise.
    resolve_engine: Callable[[str], str]

    async def process(
        self,
        request: ProcessInput,
        route: InputRoute,  # noqa: ARG002
    ) -> ProcessResult:
        """Transcribe ``request.input_source``.

        ``route`` is accepted for protocol compatibility and not used.

        Raises:
            InputNotFoundError: the source is not a URL and the path does not
                exist.
        """
        if not is_url(request.input_source):
            local_path = Path(request.input_source)
            if not local_path.exists():
                raise InputNotFoundError(
                    f"Input file not found: {request.input_source}"
                )

            engine = self.resolve_engine(request.transcribe_engine)
            # The second element of the file result is discarded.
            text_context, _unused = await self.process_file(
                local_path.as_posix(),
                engine,
                request.language,
                request.model,
                request.temp_dir,
            )
            return ProcessResult(
                task_type="transcript",
                text_context=text_context,
                platform=None,
            )

        # For URLs an explicit engine is resolved only for its side effect
        # (any exception it raises); the unresolved name is what gets
        # forwarded to ``process_url``.
        if request.transcribe_engine != "auto":
            self.resolve_engine(request.transcribe_engine)

        cookies_arg = str(request.cookies) if request.cookies else None
        text_context, platform = await self.process_url(
            request.input_source,
            request.transcribe_engine,
            request.language,
            request.model,
            request.save_original,
            cookies_arg,
            request.cookies_from_browser,
            request.temp_dir,
            request.raw_transcript,
        )
        return ProcessResult(
            task_type="transcript",
            text_context=text_context,
            platform=platform,
        )
aimd/cli.py ADDED
@@ -0,0 +1,9 @@
1
+ """CLI entrypoint."""
2
+
3
+ from .adapters.cli.app import app, main
4
+
5
+ __all__ = ["app", "main"]
6
+
7
+
8
+ if __name__ == "__main__":
9
+ main()
aimd/const.py ADDED
@@ -0,0 +1,31 @@
1
+ """Constants used throughout the aimd package."""
2
+
3
+ # =============================================================================
4
+ # FILE PROCESSING CONSTANTS
5
+ # =============================================================================
6
+
7
+ from aimd_media.const import AUDIO_EXTENSIONS
8
+
9
# Ebook file extensions (require special handling for image extraction)
BOOK_EXTENSIONS = {".epub", ".mobi", ".azw3"}

# Union of audio, ebook and document/data extensions (presumably everything
# the MarkItDown-based processor accepts — confirm against its caller).
MARKITDOWN_FILE_EXTENSIONS = (
    AUDIO_EXTENSIONS
    | BOOK_EXTENSIONS
    | {
        # Plain text, markup and data
        ".csv",
        ".htm",
        ".html",
        ".json",
        ".md",
        ".txt",
        ".xml",
        # Office documents and PDF
        ".doc",
        ".docx",
        ".pdf",
        ".ppt",
        ".pptx",
        ".xls",
        ".xlsx",
    }
)
aimd/errors.py ADDED
@@ -0,0 +1,41 @@
1
+ """Domain-level exceptions used across CLI, service, and API layers."""
2
+
3
+
4
class AimdError(Exception):
    """Root of the package's predictable error hierarchy.

    The message is available both through ``str(exc)`` and as ``exc.message``.
    """

    # Numeric code attached to the error class (presumably an HTTP-style
    # status used by the adapters — confirm against callers).
    status_code = 400

    def __init__(self, message: str):
        self.message = message
        super().__init__(message)
12
+
13
+
14
class UnsupportedInputError(AimdError):
    """Signal that the given input source cannot be handled."""

    status_code = 400
18
+
19
+
20
class UnsupportedEngineError(AimdError):
    """Signal that the requested transcription engine is not a known one."""

    status_code = 400
24
+
25
+
26
class EngineUnavailableError(AimdError):
    """Signal that a known engine cannot run in the current environment."""

    status_code = 422
30
+
31
+
32
class InputNotFoundError(AimdError):
    """Signal that the input path does not exist."""

    status_code = 404
36
+
37
+
38
class ProcessingFailedError(AimdError):
    """Signal that processing failed in a known, reportable way."""

    status_code = 500
@@ -0,0 +1,26 @@
1
+ # packages/aimd/src/aimd/infrastructure
2
+
3
+ Infrastructure layer in the main `aimd` package.
4
+
5
+ ## Responsibilities
6
+
7
+ - MarkItDown runner for local file conversion (`markitdown_processor.py`).
8
+ - Media adapter that wraps `aimd-media` URL extraction results as `TextContext`.
9
+ - Markdown chunking/title helpers for MarkItDown output.
10
+
11
+ ## Model Notes
12
+
13
+ - `aimd_media.const.MLX_AUDIO_MODELS` is a curated allow-list for mlx-audio STT IDs. It includes Qwen3-ASR quantized variants plus newer mlx-audio 0.4.4 STT models such as Whisper, Distil-Whisper, Parakeet, Nemotron ASR, Voxtral, VibeVoice-ASR, and Qwen2-Audio.
14
+ - Do not add mlx-audio forced aligner models to transcription until the product/API accepts reference text and timestamp output.
15
+ - mlx Qwen3-ASR gets a default `Chinese` language hint for existing behavior. Other mlx-audio STT models should receive no language hint unless the caller provided one and the model `generate()` signature accepts `language`.
16
+ - `aimd_media.const.QWEN_ASR_MODELS` tracks official Qwen3-ASR models only: `Qwen/Qwen3-ASR-1.7B` and `Qwen/Qwen3-ASR-0.6B`.
17
+ - Qwen3-ASR upstream supports more languages/dialects than the local `LANGUAGE_CODE_TO_NAME` table exposes; expanding the table is a valid small follow-up if the adapters need those codes.
18
+
19
+ ## Rules
20
+
21
+ - Do not import adapters.
22
+ - Keep modules focused and small by pipeline concern.
23
+ - Raise domain errors from `aimd.errors`.
24
+ - Keep capability checks fail-fast before expensive model work.
25
+ - When using `tempfile.TemporaryDirectory` or `tempfile.NamedTemporaryFile`, always pass the `dir=temp_dir` parameter so callers can redirect temp I/O to a sandbox-safe location via `AIMD_TEMP_DIR`.
26
+ - Do not add compatibility aliases for old internal paths; local file processors should use MarkItDown contracts.
@@ -0,0 +1 @@
1
+ """Infrastructure layer package."""
@@ -0,0 +1,19 @@
1
+ """Document processing infrastructure."""
2
+
3
+ from .chunking import (
4
+ combine_sections_for_processing,
5
+ split_markdown_by_header_level,
6
+ split_markdown_by_headers,
7
+ split_processed_chunk_into_chapters,
8
+ split_text_by_paragraphs,
9
+ )
10
+ from .title_extractor import extract_title_from_content
11
+
12
+ __all__ = [
13
+ "combine_sections_for_processing",
14
+ "extract_title_from_content",
15
+ "split_markdown_by_header_level",
16
+ "split_markdown_by_headers",
17
+ "split_processed_chunk_into_chapters",
18
+ "split_text_by_paragraphs",
19
+ ]
@@ -0,0 +1,168 @@
1
+ """Chunking helpers for markdown document processing."""
2
+
3
+ import re
4
+
5
+ from logly import logger
6
+
7
+
8
+ def combine_sections_for_processing(
9
+ section_data: list[tuple[str | None, str]], max_chunk_size: int = 40000
10
+ ) -> list[str]:
11
+ """Combine multiple sections into larger chunks to reduce API calls."""
12
+ combined_chunks = []
13
+ current_chunk_parts = []
14
+ current_chunk_size = 0
15
+ section_separator = "\n\n" + "=" * 80 + "\n\n"
16
+ separator_size = len(section_separator)
17
+
18
+ for _, content in section_data:
19
+ section_size = len(content)
20
+ would_exceed = (
21
+ current_chunk_size
22
+ + section_size
23
+ + (separator_size if current_chunk_parts else 0)
24
+ ) > max_chunk_size
25
+
26
+ if would_exceed and current_chunk_parts:
27
+ combined_chunks.append(section_separator.join(current_chunk_parts))
28
+ current_chunk_parts = []
29
+ current_chunk_size = 0
30
+
31
+ current_chunk_parts.append(content)
32
+ current_chunk_size += section_size
33
+ if len(current_chunk_parts) > 1:
34
+ current_chunk_size += separator_size
35
+
36
+ if current_chunk_parts:
37
+ combined_chunks.append(section_separator.join(current_chunk_parts))
38
+
39
+ return combined_chunks
40
+
41
+
42
def split_text_by_paragraphs(text: str, max_chunk_size: int) -> list[str]:
    """Split text on blank lines, packing paragraphs into size-limited chunks.

    Paragraphs are stripped and joined with a blank line while the joined
    length stays within ``max_chunk_size``. A paragraph that alone exceeds the
    limit is cut into consecutive slices of ``max_chunk_size`` characters;
    each slice is stripped and empty slices are dropped.

    Args:
        text: Text to split.
        max_chunk_size: Maximum chunk length in characters; must be positive.

    Returns:
        The chunks in document order; empty when ``text`` is blank.

    Raises:
        ValueError: ``max_chunk_size`` is smaller than 1. Without this check a
            negative limit silently discarded every paragraph and zero failed
            with an unrelated ``range()`` error.
    """
    if max_chunk_size < 1:
        raise ValueError(
            f"max_chunk_size must be a positive integer, got {max_chunk_size}"
        )

    paragraphs = [
        block.strip() for block in re.split(r"\n\s*\n", text) if block.strip()
    ]

    chunks: list[str] = []
    current_parts: list[str] = []
    current_size = 0

    for paragraph in paragraphs:
        paragraph_len = len(paragraph)
        separator_len = 2 if current_parts else 0

        # Flush the running chunk when this paragraph would overflow it.
        if current_parts and current_size + separator_len + paragraph_len > max_chunk_size:
            chunks.append("\n\n".join(current_parts))
            current_parts = []
            current_size = 0
            separator_len = 0

        if paragraph_len > max_chunk_size:
            # An oversized paragraph always overflows, so the running chunk
            # was flushed above and is empty here; hard-wrap the paragraph.
            for idx in range(0, paragraph_len, max_chunk_size):
                piece = paragraph[idx : idx + max_chunk_size].strip()
                if piece:
                    chunks.append(piece)
            continue

        current_parts.append(paragraph)
        current_size += separator_len + paragraph_len

    if current_parts:
        chunks.append("\n\n".join(current_parts))
    return chunks
82
+
83
+
84
+ def split_markdown_by_header_level(
85
+ markdown_content: str,
86
+ header_level: int,
87
+ ) -> list[tuple[str | None, str]]:
88
+ """Split markdown content by a specific header level."""
89
+ header_pattern = f"^{'#' * header_level}\\s+(.+)$"
90
+ lines = markdown_content.split("\n")
91
+
92
+ sections = []
93
+ current_lines = []
94
+ current_title = None
95
+
96
+ def _save_current_section() -> None:
97
+ if current_lines:
98
+ content = "\n".join(current_lines).strip()
99
+ if content:
100
+ sections.append((current_title, content))
101
+
102
+ for line in lines:
103
+ header_match = re.match(header_pattern, line)
104
+
105
+ if header_match:
106
+ _save_current_section()
107
+ current_title = header_match.group(1).strip()
108
+ current_lines = [line]
109
+ else:
110
+ current_lines.append(line)
111
+
112
+ _save_current_section()
113
+ return sections
114
+
115
+
116
def split_markdown_by_headers(
    markdown_content: str,
    max_chunk_size: int = 40000,
) -> tuple[list[tuple[str | None, str]], int | None]:
    """Pick the shallowest heading level whose sections all fit the limit.

    Levels 1 through 6 are tried in order. A level is accepted when it yields
    more than one section and no section is longer than ``max_chunk_size``.
    If no level qualifies, the content is split by paragraphs instead.

    Returns:
        ``(sections, level)`` where ``level`` is the accepted heading level,
        or ``None`` when the paragraph fallback was used (all titles are then
        ``None``).
    """
    for level in range(1, 7):
        candidate = split_markdown_by_header_level(markdown_content, level)
        if len(candidate) < 2:
            # A single section means this level does not divide the document.
            continue

        largest = max(len(body) for _, body in candidate)
        if largest > max_chunk_size:
            continue

        logger.info(
            f"Using split level {level} - all chunks under {max_chunk_size} chars (max: {largest})"
        )
        return candidate, level

    paragraphs = split_text_by_paragraphs(markdown_content, max_chunk_size)
    return [(None, chunk) for chunk in paragraphs], None
143
+
144
+
145
def split_processed_chunk_into_chapters(
    processed_content: str,
    header_level: int | None = None,
) -> list[tuple[str, str]]:
    """Break processed content into ``(filename-safe title, text)`` chapters.

    When ``header_level`` is given, the content is split at headings of that
    level and each section is titled from its own text, falling back to the
    heading title or ``"Chapter"``. If that yields nothing, or no level is
    given, the whole content is returned as a single chapter.
    """
    from .title_extractor import extract_title_from_content

    if header_level is not None:
        chapters = [
            (
                extract_title_from_content(
                    body,
                    heading or "Chapter",
                    for_filename=True,
                ),
                body.strip(),
            )
            for heading, body in split_markdown_by_header_level(
                processed_content, header_level
            )
            if body.strip()
        ]
        if chapters:
            return chapters

    whole_title = extract_title_from_content(
        processed_content, "Chapter", for_filename=True
    )
    return [(whole_title, processed_content.strip())]
@@ -0,0 +1,90 @@
1
+ """Title extraction utilities for markdown content."""
2
+
3
+ import re
4
+
5
+
6
def _front_matter_end(lines: list[str]) -> int:
    """Return the index of the first line after a leading ``---`` block.

    Returns 0 when ``lines`` does not open with a ``---`` line or the block is
    never closed by a second ``---`` line.
    """
    if not lines or lines[0].strip() != "---":
        return 0
    for index in range(1, len(lines)):
        if lines[index].strip() == "---":
            return index + 1
    return 0


def extract_title_from_content(
    content: str, fallback_title: str = "Untitled", for_filename: bool = False
) -> str:
    """Extract a cleaned title from markdown content.

    Sources are tried in this order:

    1. the first line starting with ``# ``;
    2. a ``title:`` entry in a leading ``---`` front matter block;
    3. the first non-empty line underlined by a line of only ``=`` or ``-``;
    4. the first suitable line among the first five body lines (2-100
       characters, not an image, div, ``:::`` fence, ``[]{`` span, URL, or a
       line mentioning ``calibre`` / ``kindle-cn``).

    Steps 3 and 4 skip a closed leading front matter block, so its ``---``
    delimiters are not taken for an underline or a title.

    The chosen text is stripped of markdown emphasis, links, attribute
    braces, footnote markers, ``#anchor`` fragments, parentheticals,
    surrounding quotes and trailing punctuation.

    Args:
        content: Markdown text to inspect.
        fallback_title: Returned when no title is found or cleaning leaves
            nothing.
        for_filename: When true, also remove ``<>:"/\\|?*``, replace
            whitespace runs with ``_`` and truncate to 50 characters.

    Returns:
        The cleaned title, or ``fallback_title``.
    """
    if not content or not content.strip():
        return fallback_title

    lines = content.strip().split("\n")
    body_start = _front_matter_end(lines)
    extracted_title = None

    # 1. First level-one ATX heading anywhere in the content.
    for line in lines:
        line = line.strip()
        if line.startswith("# "):
            extracted_title = line[2:].strip()
            break

    # 2. ``title:`` between the first two ``---`` lines.
    if not extracted_title and content.strip().startswith("---"):
        in_frontmatter = False
        for line in lines:
            if line.strip() == "---":
                if in_frontmatter:
                    break
                in_frontmatter = True
                continue
            if in_frontmatter and line.strip().startswith("title:"):
                title_match = re.match(r'title:\s*["\']?([^"\']+)["\']?', line.strip())
                if title_match:
                    extracted_title = title_match.group(1).strip()
                    break

    # 3. Setext heading. The title line must be non-empty so that a
    #    horizontal rule after a blank line does not end the search early.
    if not extracted_title:
        for index in range(body_start, len(lines) - 1):
            candidate = lines[index].strip()
            underline = lines[index + 1].strip()
            if candidate and underline and set(underline) in ({"="}, {"-"}):
                extracted_title = candidate
                break

    # 4. First plausible line near the top of the body.
    if not extracted_title:
        for line in lines[body_start : body_start + 5]:
            line = line.strip()
            if not line:
                continue
            if (
                line.startswith(("![", "[]{", ":::", "<div", "</div"))
                or "calibre" in line.lower()
                or "kindle-cn" in line.lower()
            ):
                continue
            if 2 <= len(line) <= 100 and not line.lower().startswith("http"):
                extracted_title = line
                break

    if not extracted_title:
        return fallback_title

    clean_text = re.sub(r"^#+\s*", "", extracted_title)
    clean_text = re.sub(r"\*+([^*]+)\*+", r"\1", clean_text)
    clean_text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", clean_text)
    clean_text = re.sub(r"\{[^}]*\}", "", clean_text)
    clean_text = re.sub(r"\[\^[^\]]*\](?:\([^)]*\))?", "", clean_text)
    clean_text = re.sub(r"\^[^\]]*\]", "", clean_text)
    clean_text = re.sub(r"#[a-zA-Z0-9_.-]+", "", clean_text)
    clean_text = re.sub(r"\([^)]*\)", "", clean_text)
    # NOTE(review): the repeated quote characters in this class suggest
    # typographic quotes were intended — confirm against the upstream source.
    clean_text = re.sub(r'^["""\'\']+|["""\'\']+$', "", clean_text)
    clean_text = re.sub(r"\s+", " ", clean_text).strip()
    clean_text = re.sub(r"[。,、;:!?]+$", "", clean_text)

    if for_filename:
        clean_text = re.sub(r'[<>:"/\\|?*]', "", clean_text)
        clean_text = re.sub(r"\s+", "_", clean_text.strip())
        if len(clean_text) > 50:
            clean_text = clean_text[:50].rstrip("_")

    return clean_text or fallback_title