aimd-cli 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aimd/AGENTS.md +68 -0
- aimd/__init__.py +13 -0
- aimd/adapters/AGENTS.md +23 -0
- aimd/adapters/__init__.py +1 -0
- aimd/adapters/cli/__init__.py +1 -0
- aimd/adapters/cli/app.py +216 -0
- aimd/application/AGENTS.md +31 -0
- aimd/application/__init__.py +14 -0
- aimd/application/bootstrap.py +51 -0
- aimd/application/models.py +43 -0
- aimd/application/services/__init__.py +1 -0
- aimd/application/services/interface_payloads.py +81 -0
- aimd/application/services/output_writer.py +68 -0
- aimd/application/use_cases/__init__.py +1 -0
- aimd/application/use_cases/input_routing.py +51 -0
- aimd/application/use_cases/list_engines.py +34 -0
- aimd/application/use_cases/process_input.py +40 -0
- aimd/application/use_cases/processors/__init__.py +13 -0
- aimd/application/use_cases/processors/_base.py +17 -0
- aimd/application/use_cases/processors/convert.py +35 -0
- aimd/application/use_cases/processors/transcript.py +92 -0
- aimd/cli.py +9 -0
- aimd/const.py +31 -0
- aimd/errors.py +41 -0
- aimd/infrastructure/AGENTS.md +26 -0
- aimd/infrastructure/__init__.py +1 -0
- aimd/infrastructure/documents/__init__.py +19 -0
- aimd/infrastructure/documents/chunking.py +168 -0
- aimd/infrastructure/documents/title_extractor.py +90 -0
- aimd/infrastructure/markitdown_processor.py +103 -0
- aimd/infrastructure/media_processor.py +51 -0
- aimd/platform_utils.py +26 -0
- aimd/py.typed +0 -0
- aimd/types.py +12 -0
- aimd/utils.py +70 -0
- aimd_cli-0.9.2.dist-info/METADATA +23 -0
- aimd_cli-0.9.2.dist-info/RECORD +39 -0
- aimd_cli-0.9.2.dist-info/WHEEL +4 -0
- aimd_cli-0.9.2.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""MarkItDown-backed local file conversion."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from functools import partial
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from markitdown import MarkItDown
|
|
8
|
+
|
|
9
|
+
from ..const import BOOK_EXTENSIONS, MARKITDOWN_FILE_EXTENSIONS
|
|
10
|
+
from ..errors import InputNotFoundError, UnsupportedInputError
|
|
11
|
+
from ..types import TextContext
|
|
12
|
+
from .documents.chunking import (
|
|
13
|
+
combine_sections_for_processing,
|
|
14
|
+
split_markdown_by_headers,
|
|
15
|
+
)
|
|
16
|
+
from .documents.title_extractor import extract_title_from_content
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def is_supported_file(file_path: str | Path) -> bool:
|
|
20
|
+
"""Return whether a local file extension should be offered to MarkItDown."""
|
|
21
|
+
if isinstance(file_path, str) and file_path.startswith(("http://", "https://")):
|
|
22
|
+
return False
|
|
23
|
+
return Path(file_path).suffix.lower() in MARKITDOWN_FILE_EXTENSIONS
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _text_context_from_markdown(
    markdown: str,
    fallback_title: str,
    title: str | None,
    max_chunk_size: int,
) -> TextContext:
    """Wrap markdown text in a TextContext, chunking it when it is too long.

    Args:
        markdown: Markdown text to wrap.
        fallback_title: Passed to ``extract_title_from_content`` when no
            explicit title is supplied.
        title: Explicit title; used as-is when truthy.
        max_chunk_size: Largest stripped length (in characters) that is
            kept as a single chunk.

    Returns:
        A TextContext. Text that fits in one chunk yields a one-element
        ``chunk_list`` (empty when the stripped text is empty) and
        ``split_header_level=None``. Longer text is split by headers,
        whitespace-only sections are dropped, and the rest is recombined.
    """
    # Title extraction deliberately sees the unstripped markdown.
    heading = title if title else extract_title_from_content(markdown, fallback_title)
    body = markdown.strip()

    if len(body) > max_chunk_size:
        sections, level = split_markdown_by_headers(
            body,
            max_chunk_size=max_chunk_size,
        )
        kept = []
        for name, text in sections:
            trimmed = text.strip()
            if trimmed:
                kept.append((name, trimmed))
        pieces = combine_sections_for_processing(kept, max_chunk_size)
    else:
        level = None
        pieces = [body] if body else []

    return TextContext(
        title=heading,
        chunk_list=pieces,
        split_header_level=level,
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
async def convert_file_with_markitdown(
    file_path: str | Path,
    transcribe_engine: str = "auto",
    language: str | None = None,
    model: str | None = None,
    temp_dir: Path | None = None,
    *,
    max_chunk_size: int = 40000,
) -> tuple[TextContext, Path | None]:
    """Run a local file through MarkItDown (plugins enabled) and wrap the result.

    Args:
        file_path: Path of the file to convert.
        transcribe_engine: Forwarded unchanged to ``MarkItDown.convert``.
        language: Forwarded unchanged to ``MarkItDown.convert``.
        model: Forwarded unchanged to ``MarkItDown.convert``.
        temp_dir: Forwarded unchanged to ``MarkItDown.convert``.
        max_chunk_size: Chunk size limit used when building the TextContext.

    Returns:
        ``(text_context, output_dir)``. ``output_dir`` is
        ``<parent>/<stem>`` of the input when its extension is in
        ``BOOK_EXTENSIONS``, otherwise ``None``.

    Raises:
        InputNotFoundError: The path does not exist.
        UnsupportedInputError: The path exists but is not a regular file.
    """
    source = Path(file_path)
    if not source.exists():
        raise InputNotFoundError(f"Input file not found: {source}")
    if not source.is_file():
        raise UnsupportedInputError(f"Path is not a file: {source}")

    output_dir = None
    if source.suffix.lower() in BOOK_EXTENSIONS:
        output_dir = source.parent / source.stem

    converter = MarkItDown(enable_plugins=True)
    convert_call = partial(
        converter.convert,
        source,
        transcribe_engine=transcribe_engine,
        language=language,
        model=model,
        temp_dir=temp_dir,
        output_dir=output_dir,
    )
    # The conversion is a blocking call, so it is handed to the loop's
    # default executor instead of being run on the event loop itself.
    result = await asyncio.get_running_loop().run_in_executor(None, convert_call)

    context = _text_context_from_markdown(
        result.markdown,
        fallback_title=source.stem,
        title=result.title,
        max_chunk_size=max_chunk_size,
    )
    return context, output_dir
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""aimd wrappers around the aimd-media package."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from aimd_media.errors import ProcessingFailedError as MediaProcessingFailedError
|
|
6
|
+
from aimd_media.errors import UnsupportedInputError as MediaUnsupportedInputError
|
|
7
|
+
from aimd_media.url import get_text_from_url
|
|
8
|
+
|
|
9
|
+
from ..errors import ProcessingFailedError, UnsupportedInputError
|
|
10
|
+
from ..types import TextContext
|
|
11
|
+
from .markitdown_processor import _text_context_from_markdown
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
async def get_text_context_from_media_url(
    url: str,
    transcribe_engine: str = "auto",
    language: str | None = None,
    model: str | None = None,
    save_original_path: Path | None = None,
    cookies_file: str | None = None,
    cookies_from_browser: str | None = None,
    temp_dir: Path | None = None,
    raw_transcript: bool = False,
    *,
    max_chunk_size: int = 40000,
) -> tuple[TextContext, str]:
    """Extract a media URL through aimd-media and wrap it as TextContext.

    Args:
        url: Media URL handed to ``aimd_media.url.get_text_from_url``.
        transcribe_engine: Forwarded unchanged to ``get_text_from_url``.
        language: Forwarded unchanged to ``get_text_from_url``.
        model: Forwarded unchanged to ``get_text_from_url``.
        save_original_path: Forwarded unchanged to ``get_text_from_url``.
        cookies_file: Forwarded unchanged to ``get_text_from_url``.
        cookies_from_browser: Forwarded unchanged to ``get_text_from_url``.
        temp_dir: Forwarded unchanged to ``get_text_from_url``.
        raw_transcript: Forwarded unchanged to ``get_text_from_url``.
        max_chunk_size: Chunk size limit used when building the
            TextContext. Keyword-only; the default keeps the previously
            hard-coded value of 40000.

    Returns:
        ``(text_context, platform)`` where ``platform`` is the
        ``platform`` attribute of the aimd-media result.

    Raises:
        UnsupportedInputError: aimd-media rejected the input as unsupported.
        ProcessingFailedError: aimd-media failed while processing the input.
    """
    try:
        result = await get_text_from_url(
            url=url,
            transcribe_engine=transcribe_engine,
            language=language,
            model=model,
            save_original_path=save_original_path,
            cookies_file=cookies_file,
            cookies_from_browser=cookies_from_browser,
            temp_dir=temp_dir,
            raw_transcript=raw_transcript,
        )
    # Re-raise aimd-media's errors as this package's own error types while
    # keeping the original exception chained as the cause.
    except MediaUnsupportedInputError as exc:
        raise UnsupportedInputError(str(exc)) from exc
    except MediaProcessingFailedError as exc:
        raise ProcessingFailedError(str(exc)) from exc

    return (
        _text_context_from_markdown(
            result.markdown,
            fallback_title=result.title,
            title=result.title,
            max_chunk_size=max_chunk_size,
        ),
        result.platform,
    )
|
aimd/platform_utils.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Platform and hardware detection helpers."""
|
|
2
|
+
|
|
3
|
+
from functools import lru_cache
|
|
4
|
+
import platform
|
|
5
|
+
import subprocess
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@lru_cache(maxsize=1)
def is_apple_silicon() -> bool:
    """Return True when running on Apple Silicon macOS.

    Detection reads the CPU brand string from ``sysctl`` and looks for
    "apple" together with an M-series token (``m`` followed by digits).
    Any generation number is accepted, so newer chips are recognised
    without editing a hard-coded list. The result is cached after the
    first call.

    Returns:
        True on macOS when the brand string names an Apple M-series chip.
        False on other systems, or when ``sysctl`` cannot be run or exits
        with an error.
    """
    if platform.system() != "Darwin":
        return False
    try:
        result = subprocess.run(
            ["sysctl", "-n", "machdep.cpu.brand_string"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.SubprocessError, OSError):
        # Best-effort probe: a missing, non-executable or failing sysctl
        # means "unknown", which is reported as not Apple Silicon.
        return False

    cpu_info = result.stdout.strip().lower()
    if "apple" not in cpu_info:
        return False
    # A token such as "m1" or "m5": the letter m followed only by digits.
    return any(
        token.startswith("m") and token[1:].isdigit() for token in cpu_info.split()
    )
|
aimd/py.typed
ADDED
|
File without changes
|
aimd/types.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class TextContext(BaseModel):
    """Titled text split into chunks, plus the header level used to split it."""

    # Required: no default is supplied for title or chunk_list.
    title: str = Field(description="Title of the text")
    chunk_list: list[str] = Field(description="List of combined text chunks")
    # Optional: stays None when the text was not split by headers.
    split_header_level: int | None = Field(
        description="Header level used for splitting (1-6), None if no splitting was done",
        default=None,
    )
|
aimd/utils.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def sanitize_filename(title: str, max_length: int = 100) -> str:
    """Sanitize title for use as filename.

    Args:
        title: Title to sanitize
        max_length: Maximum length for filename

    Returns:
        Sanitized filename safe for filesystem; "output" when nothing
        usable is left after sanitizing.
    """
    # Replace characters that are invalid or act as path separators. The
    # backslash is listed explicitly (as ``\\``) because it separates path
    # components on Windows and must not survive into a filename.
    sanitized = re.sub(r'[<>:"/\\|?*]', "_", title)
    # Collapse each run of whitespace into a single underscore.
    sanitized = re.sub(r"\s+", "_", sanitized.strip())
    # Drop leading/trailing dots and underscores.
    sanitized = sanitized.strip("._")
    # Truncate, then trim again so the cut does not end on "." or "_".
    if len(sanitized) > max_length:
        sanitized = sanitized[:max_length].rstrip("._")
    # Never return an empty name.
    return sanitized or "output"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def create_output_path_from_title(
    title: str, template_name: str, current_dir: Path | None = None
) -> Path:
    """Create output path using title and template name.

    Args:
        title: Title from TextContext; sanitized before use
        template_name: Template name for suffix; used as given
        current_dir: Directory to save file (defaults to current working directory)

    Returns:
        ``current_dir / "<sanitized title>_<template_name>.md"``
    """
    if current_dir is None:
        current_dir = Path.cwd()

    sanitized_title = sanitize_filename(title)
    return current_dir / f"{sanitized_title}_{template_name}.md"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def is_url(s: str) -> bool:
    """Check if a string is a URL using basic URL parsing.

    Args:
        s: String to check

    Returns:
        True when the parsed string has both a scheme and a network
        location, or when its scheme is ``http``/``https`` even without a
        network location. Schemeless strings such as "www.google.com"
        are not treated as URLs. False when parsing raises ValueError.
    """
    try:
        parts = urlparse(s)
    except ValueError:
        return False

    scheme = parts.scheme
    netloc = parts.netloc
    if scheme and netloc:
        return True
    # No scheme+netloc pair: only a bare http/https scheme still counts.
    return scheme in ("http", "https") and not netloc
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: aimd-cli
|
|
3
|
+
Version: 0.9.2
|
|
4
|
+
Summary: Context preparation tool for LLM workflows.
|
|
5
|
+
Author: Shu Li
|
|
6
|
+
Author-email: Shu Li <zetarylee@gmail.com>
|
|
7
|
+
Requires-Dist: aimd-media
|
|
8
|
+
Requires-Dist: aimd-book
|
|
9
|
+
Requires-Dist: logly>=0.1.6
|
|
10
|
+
Requires-Dist: markitdown>=0.1.1,<0.2.0
|
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
|
12
|
+
Requires-Dist: python-dotenv>=1.1.1
|
|
13
|
+
Requires-Dist: rich>=13.9.4
|
|
14
|
+
Requires-Dist: typer>=0.20.0
|
|
15
|
+
Requires-Dist: aimd-api ; extra == 'all'
|
|
16
|
+
Requires-Dist: aimd-mcp ; extra == 'all'
|
|
17
|
+
Requires-Dist: aimd-ocr ; extra == 'all'
|
|
18
|
+
Requires-Dist: aimd-html ; extra == 'all'
|
|
19
|
+
Requires-Python: >=3.10, <3.13
|
|
20
|
+
Provides-Extra: all
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
Prepare LLM-ready context from URLs, audio/video, and documents.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
aimd/AGENTS.md,sha256=qWZdW57zt9dmCADPdDbSjmxfyH7pWMeECtA3PUpuGq0,3954
|
|
2
|
+
aimd/__init__.py,sha256=mKPHHwULavRIzkNJ9htoTO9FO_8TgS__g6E0_BpaD0A,241
|
|
3
|
+
aimd/adapters/AGENTS.md,sha256=l1MEcwqziBKC28oXJsamh2vDTh3sdpKQbHSfbUXy84E,982
|
|
4
|
+
aimd/adapters/__init__.py,sha256=kPCiAvKFPgizgLXLTrC4RW1oZkbMRW5SFLoTnBCJHZA,32
|
|
5
|
+
aimd/adapters/cli/__init__.py,sha256=R6ccDx1BPyu31QO3lmw57_eCke4CkeAZAuEbGLDk96s,27
|
|
6
|
+
aimd/adapters/cli/app.py,sha256=-MS60tmFkGDJtd50CcxE16i2vAVq0br96xt5nlf2gsM,7465
|
|
7
|
+
aimd/application/AGENTS.md,sha256=LAeM0Jm2-HsAXJbNqqNO-wfX-CZ8elGrIXJn7KioUy4,2046
|
|
8
|
+
aimd/application/__init__.py,sha256=fmMLANSxZPj3RKJ4srXjHEDG0ZgDvcTjLeuv1TDpKi0,320
|
|
9
|
+
aimd/application/bootstrap.py,sha256=JJGZK-Il4vo5-cpQqCqZxwi9sa3Xn3GZujdHGTUKI6Q,1650
|
|
10
|
+
aimd/application/models.py,sha256=PGmEkiKaGLYz7EsOneWZioSAfaVR1A3NxNkJX4qcQKA,1122
|
|
11
|
+
aimd/application/services/__init__.py,sha256=7O6qLl42-prqJ_SujBpJffj3T9__V6pLXHu49TEw-E4,35
|
|
12
|
+
aimd/application/services/interface_payloads.py,sha256=IwJViNVoUu06FkVhETQ9lt59RFcIh1lm9LtGYO1Q1vE,2528
|
|
13
|
+
aimd/application/services/output_writer.py,sha256=PCoYRK-B-Ul6JoI1SAA7_YDYVgTc0_XMWbWytALrmUQ,2033
|
|
14
|
+
aimd/application/use_cases/__init__.py,sha256=Tv3oOFjorCm6Etffgl4HsIUrmgtqgc6-c_OLQxGa8Ow,29
|
|
15
|
+
aimd/application/use_cases/input_routing.py,sha256=7gvv03tSmvEz1oNr2AIU3BIOQPxsGwUXxI7dZkDw1G0,1946
|
|
16
|
+
aimd/application/use_cases/list_engines.py,sha256=QIVjRguTA2NKEWlitaBn9AN-BzW6g6_aweSqiTc7rKk,1000
|
|
17
|
+
aimd/application/use_cases/process_input.py,sha256=Q7UzOxmAj8Cl048o_uDpB0GECeEtm6cyzlc9c_FCxHY,1540
|
|
18
|
+
aimd/application/use_cases/processors/__init__.py,sha256=XXYSDDR3lYRxmyRSEw-lsB4xEejc7mVPg_cK0CPFgow,365
|
|
19
|
+
aimd/application/use_cases/processors/_base.py,sha256=1B5BjSzhrP6CNhDqVOcGfwj4dGooQHmOvqmRDAEifzA,415
|
|
20
|
+
aimd/application/use_cases/processors/convert.py,sha256=m5VBEMjmlVUjJDpwQGs477HdHRvne0uSFuNmqHMPRQ4,974
|
|
21
|
+
aimd/application/use_cases/processors/transcript.py,sha256=SMyX4H2nJZNM8V1Wotw-uQdgIgoVy1kIwPQGShgDV5U,2630
|
|
22
|
+
aimd/cli.py,sha256=iJYbgVudhscnwgFV51DgtVXP-uC_OtEnMqwSmMdUBHc,130
|
|
23
|
+
aimd/const.py,sha256=8ptoja8E7lgOdorDqgG3mF12oI81fykHrfhLaiOYxcg,724
|
|
24
|
+
aimd/errors.py,sha256=pjMg0qPUZmly7nvke0MDYoZGp0-Gm2PD3rfbdjHScsw,902
|
|
25
|
+
aimd/infrastructure/AGENTS.md,sha256=OHIS53QxMfo9bABNqsFo6PYfIuqKYrZNPYS5J6VYFaU,1782
|
|
26
|
+
aimd/infrastructure/__init__.py,sha256=D_StCHKRqyvxeKpP-_lrddfbBbJe75K86cT5biZvmcw,36
|
|
27
|
+
aimd/infrastructure/documents/__init__.py,sha256=efTOlhTS6Yl054gdPxPIKlyyRDTgbHiVrPvZkb4-Svk,534
|
|
28
|
+
aimd/infrastructure/documents/chunking.py,sha256=3e-JylggA0qxYlCFGsy2DNC9uSSSrUV_bwiTWtcmGRU,5542
|
|
29
|
+
aimd/infrastructure/documents/title_extractor.py,sha256=9dCjBoP-0PRRiXutTxyEnzMnxGoCbHy6rFZ5KL3W4UE,3215
|
|
30
|
+
aimd/infrastructure/markitdown_processor.py,sha256=K8jajNE27KdMToLLI3LRol_3zh6DuO4sdgeIMuiDRHE,3224
|
|
31
|
+
aimd/infrastructure/media_processor.py,sha256=lDCBJvx0cVL0N17f8OK9CjmXUirdsdnJA4LhqaM-gZ8,1745
|
|
32
|
+
aimd/platform_utils.py,sha256=uM-Mj2fXLpZVoh-r0Ywe8xYNKgo-oEyrD7DqIeBSWDg,713
|
|
33
|
+
aimd/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
|
+
aimd/types.py,sha256=OpedoG-3X5v-xIwKxKH6zxe0cebj6gzI2DafVVQFLNU,441
|
|
35
|
+
aimd/utils.py,sha256=lc0uQQVK3RSyN2fAXLyMU-Psbi3gWQT0lhj8fRFc7vA,2076
|
|
36
|
+
aimd_cli-0.9.2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
|
|
37
|
+
aimd_cli-0.9.2.dist-info/entry_points.txt,sha256=Q5kTy2HVVf4rZGtsnhXxsabRQ8uX1ZloE0hIacd4fH0,40
|
|
38
|
+
aimd_cli-0.9.2.dist-info/METADATA,sha256=iuO6q7aLUaWhYqYNKTjjIdMU_u24A7Sd6tV5IbwOcIQ,727
|
|
39
|
+
aimd_cli-0.9.2.dist-info/RECORD,,
|