aimd-cli 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aimd/AGENTS.md +68 -0
- aimd/__init__.py +13 -0
- aimd/adapters/AGENTS.md +23 -0
- aimd/adapters/__init__.py +1 -0
- aimd/adapters/cli/__init__.py +1 -0
- aimd/adapters/cli/app.py +216 -0
- aimd/application/AGENTS.md +31 -0
- aimd/application/__init__.py +14 -0
- aimd/application/bootstrap.py +51 -0
- aimd/application/models.py +43 -0
- aimd/application/services/__init__.py +1 -0
- aimd/application/services/interface_payloads.py +81 -0
- aimd/application/services/output_writer.py +68 -0
- aimd/application/use_cases/__init__.py +1 -0
- aimd/application/use_cases/input_routing.py +51 -0
- aimd/application/use_cases/list_engines.py +34 -0
- aimd/application/use_cases/process_input.py +40 -0
- aimd/application/use_cases/processors/__init__.py +13 -0
- aimd/application/use_cases/processors/_base.py +17 -0
- aimd/application/use_cases/processors/convert.py +35 -0
- aimd/application/use_cases/processors/transcript.py +92 -0
- aimd/cli.py +9 -0
- aimd/const.py +31 -0
- aimd/errors.py +41 -0
- aimd/infrastructure/AGENTS.md +26 -0
- aimd/infrastructure/__init__.py +1 -0
- aimd/infrastructure/documents/__init__.py +19 -0
- aimd/infrastructure/documents/chunking.py +168 -0
- aimd/infrastructure/documents/title_extractor.py +90 -0
- aimd/infrastructure/markitdown_processor.py +103 -0
- aimd/infrastructure/media_processor.py +51 -0
- aimd/platform_utils.py +26 -0
- aimd/py.typed +0 -0
- aimd/types.py +12 -0
- aimd/utils.py +70 -0
- aimd_cli-0.9.2.dist-info/METADATA +23 -0
- aimd_cli-0.9.2.dist-info/RECORD +39 -0
- aimd_cli-0.9.2.dist-info/WHEEL +4 -0
- aimd_cli-0.9.2.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Use-case for engine capability introspection."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Callable
|
|
5
|
+
|
|
6
|
+
from aimd_media import EngineCapability
|
|
7
|
+
from aimd_media.errors import EngineUnavailableError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(slots=True)
class ListEnginesResult:
    """Outcome of an engine listing request.

    Produced by ``ListEnginesUseCase.execute``.
    """

    # Engine name returned by resolving "auto"; None when that resolution
    # raised EngineUnavailableError.
    auto_selected_engine: str | None
    # Capability record per engine, exactly as returned by the capability provider.
    # NOTE(review): keys are presumably engine names — confirm against aimd_media.
    engines: dict[str, EngineCapability]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(slots=True)
class ListEnginesUseCase:
    """Report the known transcription engines and which one "auto" maps to.

    Both collaborators are injected as plain callables so the use case has no
    direct dependency on how engines are discovered or resolved.
    """

    # Returns the capability record of every engine.
    get_capabilities: Callable[[], dict[str, EngineCapability]]
    # Maps an engine name (here always "auto") to a concrete engine name.
    resolve_engine: Callable[[str], str]

    def execute(self) -> ListEnginesResult:
        """Collect capabilities, then try to resolve the "auto" engine.

        Returns:
            A ``ListEnginesResult`` whose ``auto_selected_engine`` is ``None``
            when resolving "auto" raises ``EngineUnavailableError``.
        """
        engines = self.get_capabilities()

        preferred: str | None
        try:
            preferred = self.resolve_engine("auto")
        except EngineUnavailableError:
            # No engine can run in this environment; the listing is still useful.
            preferred = None

        return ListEnginesResult(auto_selected_engine=preferred, engines=engines)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Use-case for input processing orchestration."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Mapping
|
|
5
|
+
|
|
6
|
+
from ...errors import InputNotFoundError, ProcessingFailedError, UnsupportedInputError
|
|
7
|
+
from ..models import InputRoute, ProcessInput, ProcessResult, TaskType
|
|
8
|
+
from .input_routing import FileSupportChecker, ensure_supported_input
|
|
9
|
+
from .processors import TaskProcessor
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(slots=True)
class ProcessInputUseCase:
    """Route an input source to the processor registered for its task type."""

    # Processor to run for each task type.
    processors: Mapping[TaskType, TaskProcessor]
    # Predicate handed to the routing helper to decide whether a file is supported.
    is_supported_file: FileSupportChecker

    def ensure_supported_input(self, input_source: str) -> InputRoute:
        """Return the route for ``input_source`` using the module-level routing helper.

        Inside this method the bare name ``ensure_supported_input`` refers to the
        function imported from ``input_routing``, not to this method.
        """
        return ensure_supported_input(input_source, self.is_supported_file)

    async def execute(self, request: ProcessInput) -> ProcessResult:
        """Resolve the route for ``request`` and delegate to the matching processor.

        Raises:
            UnsupportedInputError: the route has no task type, or no processor is
                registered for it.
            InputNotFoundError, ProcessingFailedError: re-raised unchanged when
                raised by the processor.
            ProcessingFailedError: wraps any other exception raised by the
                processor, with the original chained as ``__cause__``.
        """
        route = self.ensure_supported_input(request.input_source)
        task_type = route.task_type
        if task_type is None:
            raise UnsupportedInputError("Unsupported input source.")

        processor = self.processors.get(task_type)
        if processor is None:
            raise UnsupportedInputError(f"No processor configured for task: {task_type}")

        try:
            result = await processor.process(request, route)
        except (InputNotFoundError, UnsupportedInputError, ProcessingFailedError):
            # Already domain errors with the right status; pass through untouched.
            raise
        except Exception as exc:
            # NOTE(review): any other error, including other AimdError subclasses
            # not listed above, is reported as ProcessingFailedError — confirm
            # that is intended.
            raise ProcessingFailedError(str(exc)) from exc
        return result
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Task processors used by the process input facade."""
|
|
2
|
+
|
|
3
|
+
from ._base import TaskProcessor
|
|
4
|
+
from .convert import ConvertProcessor, ConvertTaskProcessor
|
|
5
|
+
from .transcript import TranscriptProcessor, TranscriptTaskProcessor
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"ConvertProcessor",
|
|
9
|
+
"ConvertTaskProcessor",
|
|
10
|
+
"TaskProcessor",
|
|
11
|
+
"TranscriptProcessor",
|
|
12
|
+
"TranscriptTaskProcessor",
|
|
13
|
+
]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Base task processor contract."""
|
|
2
|
+
|
|
3
|
+
from typing import Protocol
|
|
4
|
+
|
|
5
|
+
from ...models import InputRoute, ProcessInput, ProcessResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TaskProcessor(Protocol):
    """Process one routed task into a canonical result.

    Structural (duck-typed) contract: any object exposing a matching async
    ``process`` method satisfies it; no inheritance is required.
    """

    async def process(
        self,
        request: ProcessInput,
        route: InputRoute,
    ) -> ProcessResult:
        """Run task-specific processing for a routed input.

        Args:
            request: The processing request.
            route: The route previously resolved for the request's input source.

        Returns:
            The task's ``ProcessResult``.
        """
        ...
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Document conversion task processor."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Awaitable, Callable
|
|
6
|
+
|
|
7
|
+
from ....types import TextContext
|
|
8
|
+
from ...models import InputRoute, ProcessInput, ProcessResult
|
|
9
|
+
|
|
10
|
+
# Async callable that performs the actual file conversion.
# Positional arguments, as passed by ConvertTaskProcessor.process:
#   1. str         - input path in POSIX form
#   2. str         - "auto" is passed
#   3. str | None  - None is passed
#   4. str | None  - None is passed
#   5. Path | None - the request's temp_dir
# NOTE(review): arguments 2-4 are presumably engine / language / model, by
# analogy with the transcript file processor — confirm against the implementation.
# Resolves to (text_context, output_dir).
ConvertProcessor = Callable[
    [str, str, str | None, str | None, Path | None],
    Awaitable[tuple[TextContext, Path | None]],
]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(slots=True)
class ConvertTaskProcessor:
    """Task processor that converts a local document through ``process_file``."""

    # Injected conversion callable (see ConvertProcessor).
    process_file: ConvertProcessor

    async def process(
        self,
        request: ProcessInput,
        route: InputRoute,  # noqa: ARG002
    ) -> ProcessResult:
        """Convert ``request.input_source`` and wrap the outcome in a ``ProcessResult``.

        ``route`` is accepted for protocol compatibility and is not used.
        """
        source = Path(request.input_source).as_posix()
        converted = await self.process_file(source, "auto", None, None, request.temp_dir)
        text_context, output_dir = converted
        return ProcessResult(
            task_type="convert",
            text_context=text_context,
            output_dir=output_dir,
        )
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Transcript task processor."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Awaitable, Callable
|
|
6
|
+
|
|
7
|
+
from ....errors import InputNotFoundError
|
|
8
|
+
from ....types import TextContext
|
|
9
|
+
from ....utils import is_url
|
|
10
|
+
from ...models import InputRoute, ProcessInput, ProcessResult
|
|
11
|
+
|
|
12
|
+
# Async callable type for a transcript backend taking nine positional arguments
# and resolving to a (TextContext, str | None) tuple.
# NOTE(review): TranscriptTaskProcessor in this module does not use this alias,
# and its `process_url` field differs from it in two places: the 6th argument
# (Path | None here, str | None there) and the second tuple item of the result
# (str | None here, str there). Confirm which signature is authoritative.
TranscriptProcessor = Callable[
    [
        str,
        str,
        str | None,
        str | None,
        Path | None,
        Path | None,
        str | None,
        Path | None,
        bool,
    ],
    Awaitable[tuple[TextContext, str | None]],
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(slots=True)
class TranscriptTaskProcessor:
    """Produce transcripts from URLs or from local audio/video files."""

    # URL backend. Positional arguments, in call order: source URL, engine,
    # language, model, save_original, cookies (as str), cookies_from_browser,
    # temp_dir, raw_transcript. Resolves to (text_context, platform).
    process_url: Callable[
        [
            str,
            str,
            str | None,
            str | None,
            Path | None,
            str | None,
            str | None,
            Path | None,
            bool,
        ],
        Awaitable[tuple[TextContext, str]],
    ]
    # Local-file backend: path, resolved engine, language, model, temp_dir.
    process_file: Callable[
        [str, str, str | None, str | None, Path | None],
        Awaitable[tuple[TextContext, Path | None]],
    ]
    # Maps a requested engine name to a concrete engine name.
    resolve_engine: Callable[[str], str]

    async def process(
        self,
        request: ProcessInput,
        route: InputRoute,  # noqa: ARG002
    ) -> ProcessResult:
        """Transcribe the request's input source.

        ``route`` is accepted for protocol compatibility and is not used.

        Raises:
            InputNotFoundError: the source is not a URL and the path does not exist.
        """
        source = request.input_source

        if is_url(source):
            engine = request.transcribe_engine
            if engine != "auto":
                # Called only so an unusable explicit engine fails before any
                # URL work starts; the requested name itself is forwarded.
                self.resolve_engine(engine)
            text_context, platform = await self.process_url(
                source,
                engine,
                request.language,
                request.model,
                request.save_original,
                str(request.cookies) if request.cookies else None,
                request.cookies_from_browser,
                request.temp_dir,
                request.raw_transcript,
            )
            return ProcessResult(
                task_type="transcript",
                text_context=text_context,
                platform=platform,
            )

        local_path = Path(source)
        if not local_path.exists():
            raise InputNotFoundError(f"Input file not found: {request.input_source}")

        # Local files always get a concrete engine, including for "auto".
        engine = self.resolve_engine(request.transcribe_engine)
        text_context, _ = await self.process_file(
            local_path.as_posix(),
            engine,
            request.language,
            request.model,
            request.temp_dir,
        )
        return ProcessResult(
            task_type="transcript",
            text_context=text_context,
            platform=None,
        )
|
aimd/cli.py
ADDED
aimd/const.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Constants used throughout the aimd package."""
|
|
2
|
+
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# FILE PROCESSING CONSTANTS
|
|
5
|
+
# =============================================================================
|
|
6
|
+
|
|
7
|
+
from aimd_media.const import AUDIO_EXTENSIONS
|
|
8
|
+
|
|
9
|
+
# Ebook file extensions (require special handling for image extraction)
BOOK_EXTENSIONS = {".epub", ".mobi", ".azw3"}

# Union of the audio extensions imported from aimd_media, the ebook extensions
# above, and the document/text extensions listed inline.
# NOTE(review): presumably the set of suffixes accepted for MarkItDown
# conversion — confirm against the code that consumes this constant.
MARKITDOWN_FILE_EXTENSIONS = (
    AUDIO_EXTENSIONS
    | BOOK_EXTENSIONS
    | {
        ".csv",
        ".doc",
        ".docx",
        ".html",
        ".htm",
        ".json",
        ".md",
        ".pdf",
        ".ppt",
        ".pptx",
        ".txt",
        ".xls",
        ".xlsx",
        ".xml",
    }
)
|
aimd/errors.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Domain-level exceptions used across CLI, service, and API layers."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class AimdError(Exception):
    """Root of the predictable aimd error hierarchy.

    Every instance keeps its text on ``message`` (also available via ``str()``),
    and every class carries a ``status_code`` that subclasses override.
    """

    # Default status for errors that do not override it.
    status_code = 400

    def __init__(self, message: str):
        self.message = message
        super().__init__(message)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class UnsupportedInputError(AimdError):
    """Raised when input source is unsupported.

    Carries ``status_code`` 400.
    """

    status_code = 400
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class UnsupportedEngineError(AimdError):
    """Raised when an unknown transcription engine is requested.

    Carries ``status_code`` 400.
    """

    status_code = 400
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class EngineUnavailableError(AimdError):
    """Raised when a known engine cannot run in the current environment.

    Carries ``status_code`` 422.
    """

    status_code = 422
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class InputNotFoundError(AimdError):
    """Raised when input path does not exist.

    Carries ``status_code`` 404.
    """

    status_code = 404
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ProcessingFailedError(AimdError):
    """Raised when processing fails in a known way.

    Carries ``status_code`` 500.
    """

    status_code = 500
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# packages/aimd/src/aimd/infrastructure
|
|
2
|
+
|
|
3
|
+
Infrastructure layer in the main `aimd` package.
|
|
4
|
+
|
|
5
|
+
## Responsibilities
|
|
6
|
+
|
|
7
|
+
- MarkItDown runner for local file conversion (`markitdown_processor.py`).
|
|
8
|
+
- Media adapter that wraps `aimd-media` URL extraction results as `TextContext`.
|
|
9
|
+
- Markdown chunking/title helpers for MarkItDown output.
|
|
10
|
+
|
|
11
|
+
## Model Notes
|
|
12
|
+
|
|
13
|
+
- `aimd_media.const.MLX_AUDIO_MODELS` is a curated allow-list for mlx-audio STT IDs. It includes Qwen3-ASR quantized variants plus newer mlx-audio 0.4.4 STT models such as Whisper, Distil-Whisper, Parakeet, Nemotron ASR, Voxtral, VibeVoice-ASR, and Qwen2-Audio.
|
|
14
|
+
- Do not add mlx-audio forced aligner models to transcription until the product/API accepts reference text and timestamp output.
|
|
15
|
+
- The mlx Qwen3-ASR models default to a `Chinese` language hint to preserve existing behavior. Other mlx-audio STT models should receive no language hint unless the caller provided one and the model's `generate()` signature accepts `language`.
|
|
16
|
+
- `aimd_media.const.QWEN_ASR_MODELS` tracks official Qwen3-ASR models only: `Qwen/Qwen3-ASR-1.7B` and `Qwen/Qwen3-ASR-0.6B`.
|
|
17
|
+
- Qwen3-ASR upstream supports more languages/dialects than the local `LANGUAGE_CODE_TO_NAME` table exposes; expanding the table is a valid small follow-up if the adapters need those codes.
|
|
18
|
+
|
|
19
|
+
## Rules
|
|
20
|
+
|
|
21
|
+
- Do not import adapters.
|
|
22
|
+
- Keep modules focused and small by pipeline concern.
|
|
23
|
+
- Raise domain errors from `aimd.errors`.
|
|
24
|
+
- Keep capability checks fail-fast before expensive model work.
|
|
25
|
+
- When using `tempfile.TemporaryDirectory` or `tempfile.NamedTemporaryFile`, always pass the `dir=temp_dir` parameter so callers can redirect temp I/O to a sandbox-safe location via `AIMD_TEMP_DIR`.
|
|
26
|
+
- Do not add compatibility aliases for old internal paths; local file processors should use MarkItDown contracts.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Infrastructure layer package."""
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Document processing infrastructure."""
|
|
2
|
+
|
|
3
|
+
from .chunking import (
|
|
4
|
+
combine_sections_for_processing,
|
|
5
|
+
split_markdown_by_header_level,
|
|
6
|
+
split_markdown_by_headers,
|
|
7
|
+
split_processed_chunk_into_chapters,
|
|
8
|
+
split_text_by_paragraphs,
|
|
9
|
+
)
|
|
10
|
+
from .title_extractor import extract_title_from_content
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"combine_sections_for_processing",
|
|
14
|
+
"extract_title_from_content",
|
|
15
|
+
"split_markdown_by_header_level",
|
|
16
|
+
"split_markdown_by_headers",
|
|
17
|
+
"split_processed_chunk_into_chapters",
|
|
18
|
+
"split_text_by_paragraphs",
|
|
19
|
+
]
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Chunking helpers for markdown document processing."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from logly import logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def combine_sections_for_processing(
|
|
9
|
+
section_data: list[tuple[str | None, str]], max_chunk_size: int = 40000
|
|
10
|
+
) -> list[str]:
|
|
11
|
+
"""Combine multiple sections into larger chunks to reduce API calls."""
|
|
12
|
+
combined_chunks = []
|
|
13
|
+
current_chunk_parts = []
|
|
14
|
+
current_chunk_size = 0
|
|
15
|
+
section_separator = "\n\n" + "=" * 80 + "\n\n"
|
|
16
|
+
separator_size = len(section_separator)
|
|
17
|
+
|
|
18
|
+
for _, content in section_data:
|
|
19
|
+
section_size = len(content)
|
|
20
|
+
would_exceed = (
|
|
21
|
+
current_chunk_size
|
|
22
|
+
+ section_size
|
|
23
|
+
+ (separator_size if current_chunk_parts else 0)
|
|
24
|
+
) > max_chunk_size
|
|
25
|
+
|
|
26
|
+
if would_exceed and current_chunk_parts:
|
|
27
|
+
combined_chunks.append(section_separator.join(current_chunk_parts))
|
|
28
|
+
current_chunk_parts = []
|
|
29
|
+
current_chunk_size = 0
|
|
30
|
+
|
|
31
|
+
current_chunk_parts.append(content)
|
|
32
|
+
current_chunk_size += section_size
|
|
33
|
+
if len(current_chunk_parts) > 1:
|
|
34
|
+
current_chunk_size += separator_size
|
|
35
|
+
|
|
36
|
+
if current_chunk_parts:
|
|
37
|
+
combined_chunks.append(section_separator.join(current_chunk_parts))
|
|
38
|
+
|
|
39
|
+
return combined_chunks
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def split_text_by_paragraphs(text: str, max_chunk_size: int) -> list[str]:
    """Split text by paragraph boundaries and then hard-wrap oversized blocks.

    Paragraphs (blocks separated by one or more blank lines) are stripped and
    packed greedily into chunks of at most ``max_chunk_size`` characters, joined
    by a blank line. A paragraph that is longer than the limit on its own is cut
    into consecutive ``max_chunk_size``-character pieces, each stripped.

    Args:
        text: The text to split.
        max_chunk_size: Maximum chunk length in characters; must be positive.

    Returns:
        The chunks in order; an empty list when ``text`` has no non-whitespace
        content.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive. A non-positive value
            would otherwise either fail inside ``range()`` or silently drop
            every paragraph.
    """
    if max_chunk_size <= 0:
        raise ValueError(
            f"max_chunk_size must be a positive integer, got {max_chunk_size}"
        )

    paragraphs = [
        block.strip() for block in re.split(r"\n\s*\n", text) if block.strip()
    ]
    if not paragraphs:
        # No block has non-whitespace content, so the text itself is blank.
        return []

    chunks: list[str] = []
    current_parts: list[str] = []
    current_size = 0

    for paragraph in paragraphs:
        paragraph_len = len(paragraph)
        separator_len = 2 if current_parts else 0

        # Close the open chunk if adding this paragraph would overflow it.
        if current_parts and current_size + separator_len + paragraph_len > max_chunk_size:
            chunks.append("\n\n".join(current_parts))
            current_parts = []
            current_size = 0

        if paragraph_len <= max_chunk_size:
            current_size += (2 if current_parts else 0) + paragraph_len
            current_parts.append(paragraph)
            continue

        # Oversized paragraph: the open chunk was flushed above, so hard-wrap
        # the paragraph into fixed-width pieces.
        for start in range(0, paragraph_len, max_chunk_size):
            piece = paragraph[start : start + max_chunk_size].strip()
            if piece:
                chunks.append(piece)

    if current_parts:
        chunks.append("\n\n".join(current_parts))
    return chunks
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def split_markdown_by_header_level(
|
|
85
|
+
markdown_content: str,
|
|
86
|
+
header_level: int,
|
|
87
|
+
) -> list[tuple[str | None, str]]:
|
|
88
|
+
"""Split markdown content by a specific header level."""
|
|
89
|
+
header_pattern = f"^{'#' * header_level}\\s+(.+)$"
|
|
90
|
+
lines = markdown_content.split("\n")
|
|
91
|
+
|
|
92
|
+
sections = []
|
|
93
|
+
current_lines = []
|
|
94
|
+
current_title = None
|
|
95
|
+
|
|
96
|
+
def _save_current_section() -> None:
|
|
97
|
+
if current_lines:
|
|
98
|
+
content = "\n".join(current_lines).strip()
|
|
99
|
+
if content:
|
|
100
|
+
sections.append((current_title, content))
|
|
101
|
+
|
|
102
|
+
for line in lines:
|
|
103
|
+
header_match = re.match(header_pattern, line)
|
|
104
|
+
|
|
105
|
+
if header_match:
|
|
106
|
+
_save_current_section()
|
|
107
|
+
current_title = header_match.group(1).strip()
|
|
108
|
+
current_lines = [line]
|
|
109
|
+
else:
|
|
110
|
+
current_lines.append(line)
|
|
111
|
+
|
|
112
|
+
_save_current_section()
|
|
113
|
+
return sections
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def split_markdown_by_headers(
    markdown_content: str,
    max_chunk_size: int = 40000,
) -> tuple[list[tuple[str | None, str]], int | None]:
    """Split markdown at the shallowest header level that keeps every section small.

    Header levels 1 through 6 are tried in order. The first level that yields
    more than one section, with every section at most ``max_chunk_size``
    characters long, wins. If no level qualifies, the content is split by
    paragraphs instead.

    Returns:
        ``(sections, level)`` where ``sections`` holds ``(title, content)`` pairs
        and ``level`` is the header level used, or ``None`` for the paragraph
        fallback (in which case every title is ``None``).
    """
    for level in range(1, 7):
        candidate = split_markdown_by_header_level(markdown_content, level)
        if len(candidate) < 2:
            # Nothing to split on at this depth; try a deeper level.
            continue

        largest = max(len(body) for _, body in candidate)
        if largest <= max_chunk_size:
            logger.info(
                f"Using split level {level} - all chunks under {max_chunk_size} chars (max: {largest})"
            )
            return candidate, level

    paragraph_chunks = split_text_by_paragraphs(markdown_content, max_chunk_size)
    return [(None, chunk) for chunk in paragraph_chunks], None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def split_processed_chunk_into_chapters(
    processed_content: str,
    header_level: int | None = None,
) -> list[tuple[str, str]]:
    """Break processed content back into ``(filename_title, content)`` chapters.

    When ``header_level`` is given, the content is split at that header level
    and each non-empty section becomes a chapter, titled from its own content
    (falling back to the section header, then to "Chapter"). When no level is
    given, or the split yields nothing, the whole content is returned as a
    single chapter.
    """
    from .title_extractor import extract_title_from_content

    if header_level is not None:
        chapters = [
            (
                extract_title_from_content(body, heading or "Chapter", for_filename=True),
                body.strip(),
            )
            for heading, body in split_markdown_by_header_level(
                processed_content, header_level
            )
            if body.strip()
        ]
        if chapters:
            return chapters

    whole_title = extract_title_from_content(
        processed_content, "Chapter", for_filename=True
    )
    return [(whole_title, processed_content.strip())]
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Title extraction utilities for markdown content."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _frontmatter_body_start(lines: list[str]) -> int:
    """Return the index of the first line after a closed ``---`` frontmatter block, else 0."""
    if not lines or lines[0].strip() != "---":
        return 0
    for index in range(1, len(lines)):
        if lines[index].strip() == "---":
            return index + 1
    return 0


def _is_underline(line: str) -> bool:
    """Return True when *line* is non-empty and made only of ``=`` or only of ``-``."""
    return bool(line) and (line.strip("=") == "" or line.strip("-") == "")


def extract_title_from_content(
    content: str, fallback_title: str = "Untitled", for_filename: bool = False
) -> str:
    """Extract and clean title from content with unified logic.

    Sources are tried in this order, and the first that yields a title wins:

    1. the first line starting with ``# `` (ATX level-1 header);
    2. a ``title:`` entry in leading ``---`` frontmatter;
    3. a setext header (a text line underlined by ``=`` or ``-`` characters);
    4. the first plausible line among the first five body lines.

    The title is then stripped of markdown decoration (emphasis, links,
    attribute blocks, footnotes, anchors, parentheticals, surrounding quotes
    and trailing punctuation).

    Args:
        content: Markdown text to inspect.
        fallback_title: Returned when no title is found or cleaning leaves nothing.
        for_filename: When true, also remove characters in ``<>:"/\\|?*``,
            replace whitespace runs with ``_`` and cap the result at 50 characters.

    Returns:
        The cleaned title, or ``fallback_title``.
    """
    if not content or not content.strip():
        return fallback_title

    lines = content.strip().split("\n")
    extracted_title = None

    # 1. First ATX level-1 header anywhere in the content.
    for line in lines:
        line = line.strip()
        if line.startswith("# "):
            extracted_title = line[2:].strip()
            break

    # 2. ``title:`` inside frontmatter.
    if not extracted_title and content.strip().startswith("---"):
        in_frontmatter = False
        for line in lines:
            stripped = line.strip()
            if stripped == "---":
                if in_frontmatter:
                    break
                in_frontmatter = True
                continue
            if in_frontmatter and stripped.startswith("title:"):
                value = stripped[len("title:"):].strip()
                # Remove one pair of matching surrounding quotes only, so quote
                # characters inside the value (e.g. "Don't") are preserved.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1].strip()
                if value:
                    extracted_title = value
                    break

    # The remaining heuristics look at the body only, so frontmatter keys and
    # its ``---`` delimiters are never mistaken for a title.
    body_lines = lines[_frontmatter_body_start(lines):]

    # 3. Setext header: a real text line immediately followed by an underline.
    if not extracted_title:
        for candidate, following in zip(body_lines, body_lines[1:]):
            candidate = candidate.strip()
            if (
                candidate
                and not _is_underline(candidate)
                and _is_underline(following.strip())
            ):
                extracted_title = candidate
                break

    # 4. First plausible line near the top of the body.
    if not extracted_title:
        skipped_prefixes = ("![", "[]{", ":::", "<div", "</div")
        for line in body_lines[:5]:
            line = line.strip()
            if not line or _is_underline(line):
                continue
            lowered = line.lower()
            if (
                line.startswith(skipped_prefixes)
                or "calibre" in lowered
                or "kindle-cn" in lowered
            ):
                continue
            if 2 <= len(line) <= 100 and not lowered.startswith("http"):
                extracted_title = line
                break

    if not extracted_title:
        return fallback_title

    clean_text = re.sub(r"^#+\s*", "", extracted_title)  # leading header marks
    clean_text = re.sub(r"\*+([^*]+)\*+", r"\1", clean_text)  # emphasis
    clean_text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", clean_text)  # links -> text
    clean_text = re.sub(r"\{[^}]*\}", "", clean_text)  # attribute blocks
    clean_text = re.sub(r"\[\^[^\]]*\](?:\([^)]*\))?", "", clean_text)  # footnotes
    clean_text = re.sub(r"\^[^\]]*\]", "", clean_text)  # footnote remnants
    clean_text = re.sub(r"#[a-zA-Z0-9_.-]+", "", clean_text)  # anchors
    clean_text = re.sub(r"\([^)]*\)", "", clean_text)  # parentheticals
    clean_text = re.sub(r'^["""\'\']+|["""\'\']+$', "", clean_text)  # outer quotes
    clean_text = re.sub(r"\s+", " ", clean_text).strip()
    clean_text = re.sub(r"[。,、;:!?]+$", "", clean_text)  # trailing punctuation

    if for_filename:
        clean_text = re.sub(r'[<>:"/\\|?*]', "", clean_text)
        clean_text = re.sub(r"\s+", "_", clean_text.strip())
        if len(clean_text) > 50:
            clean_text = clean_text[:50].rstrip("_")

    return clean_text or fallback_title
|