markitai 0.3.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {markitai-0.3.0 → markitai-0.3.1}/PKG-INFO +49 -47
- markitai-0.3.1/README.md +121 -0
- {markitai-0.3.0 → markitai-0.3.1}/pyproject.toml +2 -1
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/__init__.py +1 -1
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/batch.py +9 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/cli.py +110 -8
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/config.py +32 -9
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/config.schema.json +85 -81
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/constants.py +2 -2
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/_patches.py +1 -1
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/pdf.py +6 -2
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/fetch.py +1028 -148
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/image.py +145 -3
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/llm.py +292 -157
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/ocr.py +103 -32
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/prompts/__init__.py +27 -11
- markitai-0.3.1/src/markitai/prompts/cleaner_system.md +33 -0
- markitai-0.3.1/src/markitai/prompts/cleaner_user.md +3 -0
- markitai-0.3.0/src/markitai/prompts/document_enhance_complete.md → markitai-0.3.1/src/markitai/prompts/document_enhance_complete_system.md +17 -12
- markitai-0.3.1/src/markitai/prompts/document_enhance_complete_user.md +5 -0
- markitai-0.3.0/src/markitai/prompts/document_enhance.md → markitai-0.3.1/src/markitai/prompts/document_enhance_system.md +8 -7
- markitai-0.3.1/src/markitai/prompts/document_enhance_user.md +5 -0
- markitai-0.3.1/src/markitai/prompts/document_process_system.md +39 -0
- markitai-0.3.1/src/markitai/prompts/document_process_user.md +5 -0
- markitai-0.3.1/src/markitai/prompts/frontmatter_system.md +24 -0
- markitai-0.3.1/src/markitai/prompts/frontmatter_user.md +5 -0
- markitai-0.3.0/src/markitai/prompts/image_analysis.md → markitai-0.3.1/src/markitai/prompts/image_analysis_system.md +10 -4
- markitai-0.3.1/src/markitai/prompts/image_analysis_user.md +1 -0
- markitai-0.3.0/src/markitai/prompts/image_caption.md → markitai-0.3.1/src/markitai/prompts/image_caption_system.md +6 -2
- markitai-0.3.1/src/markitai/prompts/image_caption_user.md +1 -0
- markitai-0.3.0/src/markitai/prompts/image_description.md → markitai-0.3.1/src/markitai/prompts/image_description_system.md +6 -3
- markitai-0.3.1/src/markitai/prompts/image_description_user.md +1 -0
- markitai-0.3.0/src/markitai/prompts/page_content.md → markitai-0.3.1/src/markitai/prompts/page_content_system.md +5 -2
- markitai-0.3.1/src/markitai/prompts/page_content_user.md +1 -0
- markitai-0.3.0/src/markitai/prompts/url_enhance.md → markitai-0.3.1/src/markitai/prompts/url_enhance_system.md +17 -7
- markitai-0.3.1/src/markitai/prompts/url_enhance_user.md +5 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/executor.py +18 -1
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/text.py +35 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/workflow/core.py +6 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/workflow/single.py +6 -3
- {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/sub_dir/file_example_PPT_250kB.ppt +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/sub_dir/file_example_XLS_100.xls +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_executor.py +51 -0
- markitai-0.3.1/tests/unit/test_fetch.py +789 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_image.py +214 -1
- markitai-0.3.1/tests/unit/test_ocr.py +355 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_prompts.py +46 -35
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_schema_sync.py +11 -13
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_workflow_core.py +26 -0
- markitai-0.3.0/README.md +0 -120
- markitai-0.3.0/src/markitai/prompts/cleaner.md +0 -93
- markitai-0.3.0/src/markitai/prompts/document_process.md +0 -60
- markitai-0.3.0/src/markitai/prompts/frontmatter.md +0 -28
- markitai-0.3.0/tests/unit/test_fetch.py +0 -360
- markitai-0.3.0/tests/unit/test_ocr.py +0 -209
- {markitai-0.3.0 → markitai-0.3.1}/.gitignore +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/__init__.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/base.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/image.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/legacy.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/office.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/text.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/json_order.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/security.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/types.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/urls.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/__init__.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/mime.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/office.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/output.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/paths.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/workflow/__init__.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/src/markitai/workflow/helpers.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/SKILL.md +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/__init__.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/conftest.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/Free_Test_Data_500KB_PPTX.pptx +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/candy.JPG +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/file-example_PDF_500_kB.pdf +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/file_example_XLSX_100.xlsx +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/sub_dir/file-sample_100kB.doc +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/test.urls +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/integration/__init__.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/integration/test_cache.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/integration/test_cli.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/integration/test_output_format.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/integration/test_url.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/__init__.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_atomic.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_batch.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_cli_helpers.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_config.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_converter.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_image_converter.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_json_order.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_llm.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_llm_runtime.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_security.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_workflow_helpers.py +0 -0
- {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_workflow_single.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: markitai
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Document to Markdown converter with LLM enhancement
|
|
5
5
|
Project-URL: Homepage, https://markitai.ynewtime.com
|
|
6
6
|
Project-URL: Documentation, https://markitai.ynewtime.com/guide/getting-started
|
|
@@ -27,6 +27,7 @@ Requires-Dist: instructor>=1.14.0
|
|
|
27
27
|
Requires-Dist: litellm>=1.80.16
|
|
28
28
|
Requires-Dist: loguru>=0.7.3
|
|
29
29
|
Requires-Dist: markitdown[all]>=0.1.4
|
|
30
|
+
Requires-Dist: opencv-python>=4.8.0
|
|
30
31
|
Requires-Dist: pillow>=12.1.0
|
|
31
32
|
Requires-Dist: pydantic>=2.10.0
|
|
32
33
|
Requires-Dist: pymupdf4llm>=0.2.9
|
|
@@ -39,19 +40,20 @@ Description-Content-Type: text/markdown
|
|
|
39
40
|
|
|
40
41
|
# Markitai
|
|
41
42
|
|
|
42
|
-
|
|
43
|
+
Opinionated Markdown converter with native LLM enhancement support.
|
|
43
44
|
|
|
44
|
-
##
|
|
45
|
+
## Features
|
|
45
46
|
|
|
46
|
-
-
|
|
47
|
-
- **LLM
|
|
48
|
-
-
|
|
49
|
-
- **OCR
|
|
50
|
-
- **URL
|
|
47
|
+
- **Multi-format Support** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
|
|
48
|
+
- **LLM Enhancement** - Format cleaning, metadata generation, image analysis
|
|
49
|
+
- **Batch Processing** - Concurrent conversion, resume capability, progress display
|
|
50
|
+
- **OCR Recognition** - Text extraction from scanned PDFs and images
|
|
51
|
+
- **URL Conversion** - Direct webpage conversion with SPA browser rendering support
|
|
52
|
+
- **Smart Caching** - LLM result caching, SPA domain learning, auto-proxy detection
|
|
51
53
|
|
|
52
|
-
##
|
|
54
|
+
## Installation
|
|
53
55
|
|
|
54
|
-
###
|
|
56
|
+
### One-Click Setup (Recommended)
|
|
55
57
|
|
|
56
58
|
```bash
|
|
57
59
|
# Linux/macOS
|
|
@@ -61,98 +63,98 @@ curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setu
|
|
|
61
63
|
irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
|
|
62
64
|
```
|
|
63
65
|
|
|
64
|
-
###
|
|
66
|
+
### Manual Installation
|
|
65
67
|
|
|
66
68
|
```bash
|
|
67
|
-
#
|
|
69
|
+
# Requires Python 3.11+
|
|
68
70
|
uv tool install markitai
|
|
69
71
|
|
|
70
|
-
#
|
|
72
|
+
# Or using pip
|
|
71
73
|
pip install --user markitai
|
|
72
74
|
```
|
|
73
75
|
|
|
74
|
-
##
|
|
76
|
+
## Quick Start
|
|
75
77
|
|
|
76
78
|
```bash
|
|
77
|
-
#
|
|
79
|
+
# Basic conversion
|
|
78
80
|
markitai document.docx
|
|
79
81
|
|
|
80
|
-
# URL
|
|
82
|
+
# URL conversion
|
|
81
83
|
markitai https://example.com/article
|
|
82
84
|
|
|
83
|
-
# LLM
|
|
85
|
+
# LLM enhancement
|
|
84
86
|
markitai document.docx --llm
|
|
85
87
|
|
|
86
|
-
#
|
|
88
|
+
# Using presets
|
|
87
89
|
markitai document.pdf --preset rich # LLM + alt + desc + screenshot
|
|
88
90
|
markitai document.pdf --preset standard # LLM + alt + desc
|
|
89
|
-
markitai document.pdf --preset minimal #
|
|
91
|
+
markitai document.pdf --preset minimal # Basic conversion only
|
|
90
92
|
|
|
91
|
-
#
|
|
93
|
+
# Batch processing
|
|
92
94
|
markitai ./docs -o ./output
|
|
93
95
|
|
|
94
|
-
#
|
|
96
|
+
# Resume interrupted job
|
|
95
97
|
markitai ./docs -o ./output --resume
|
|
96
98
|
|
|
97
|
-
# URL
|
|
99
|
+
# Batch URL processing (auto-detect .urls files)
|
|
98
100
|
markitai urls.urls -o ./output
|
|
99
101
|
```
|
|
100
102
|
|
|
101
|
-
##
|
|
103
|
+
## Output Structure
|
|
102
104
|
|
|
103
105
|
```
|
|
104
106
|
output/
|
|
105
|
-
├── document.docx.md #
|
|
106
|
-
├── document.docx.llm.md # LLM
|
|
107
|
+
├── document.docx.md # Basic Markdown
|
|
108
|
+
├── document.docx.llm.md # LLM-enhanced version
|
|
107
109
|
├── assets/
|
|
108
110
|
│ ├── document.docx.0001.jpg
|
|
109
|
-
│ └── images.json #
|
|
110
|
-
├── screenshots/ #
|
|
111
|
+
│ └── images.json # Image descriptions
|
|
112
|
+
├── screenshots/ # Page screenshots (with --screenshot)
|
|
111
113
|
│ └── example_com.full.jpg
|
|
112
114
|
```
|
|
113
115
|
|
|
114
|
-
##
|
|
116
|
+
## Configuration
|
|
115
117
|
|
|
116
|
-
|
|
118
|
+
Priority: CLI arguments > Environment variables > Config file > Defaults
|
|
117
119
|
|
|
118
120
|
```bash
|
|
119
|
-
#
|
|
121
|
+
# View configuration
|
|
120
122
|
markitai config list
|
|
121
123
|
|
|
122
|
-
#
|
|
124
|
+
# Initialize config file
|
|
123
125
|
markitai config init -o .
|
|
124
126
|
|
|
125
|
-
#
|
|
127
|
+
# View cache status
|
|
126
128
|
markitai cache stats
|
|
127
129
|
|
|
128
|
-
#
|
|
130
|
+
# Clear cache
|
|
129
131
|
markitai cache clear
|
|
130
132
|
```
|
|
131
133
|
|
|
132
|
-
|
|
134
|
+
Config file location: `./markitai.json` or `~/.markitai/config.json`
|
|
133
135
|
|
|
134
|
-
##
|
|
136
|
+
## Environment Variables
|
|
135
137
|
|
|
136
|
-
|
|
|
137
|
-
|
|
138
|
+
| Variable | Description |
|
|
139
|
+
|----------|-------------|
|
|
138
140
|
| `OPENAI_API_KEY` | OpenAI API Key |
|
|
139
141
|
| `GEMINI_API_KEY` | Google Gemini API Key |
|
|
140
142
|
| `DEEPSEEK_API_KEY` | DeepSeek API Key |
|
|
141
143
|
| `ANTHROPIC_API_KEY` | Anthropic API Key |
|
|
142
|
-
| `JINA_API_KEY` | Jina Reader API Key
|
|
144
|
+
| `JINA_API_KEY` | Jina Reader API Key (URL conversion) |
|
|
143
145
|
|
|
144
|
-
##
|
|
146
|
+
## Dependencies
|
|
145
147
|
|
|
146
|
-
- [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF
|
|
147
|
-
- [markitdown](https://github.com/microsoft/markitdown) - Office
|
|
148
|
-
- [LiteLLM](https://github.com/BerriAI/litellm) - LLM
|
|
149
|
-
- [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR
|
|
148
|
+
- [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF conversion
|
|
149
|
+
- [markitdown](https://github.com/microsoft/markitdown) - Office documents and URL conversion
|
|
150
|
+
- [LiteLLM](https://github.com/BerriAI/litellm) - LLM gateway
|
|
151
|
+
- [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR recognition
|
|
150
152
|
|
|
151
|
-
##
|
|
153
|
+
## Documentation
|
|
152
154
|
|
|
153
|
-
- [
|
|
154
|
-
- [
|
|
155
|
-
- [CLI
|
|
155
|
+
- [Getting Started](https://markitai.ynewtime.com/guide/getting-started)
|
|
156
|
+
- [Configuration](https://markitai.ynewtime.com/guide/configuration)
|
|
157
|
+
- [CLI Reference](https://markitai.ynewtime.com/guide/cli)
|
|
156
158
|
|
|
157
159
|
## License
|
|
158
160
|
|
markitai-0.3.1/README.md
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Markitai
|
|
2
|
+
|
|
3
|
+
Opinionated Markdown converter with native LLM enhancement support.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Multi-format Support** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
|
|
8
|
+
- **LLM Enhancement** - Format cleaning, metadata generation, image analysis
|
|
9
|
+
- **Batch Processing** - Concurrent conversion, resume capability, progress display
|
|
10
|
+
- **OCR Recognition** - Text extraction from scanned PDFs and images
|
|
11
|
+
- **URL Conversion** - Direct webpage conversion with SPA browser rendering support
|
|
12
|
+
- **Smart Caching** - LLM result caching, SPA domain learning, auto-proxy detection
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
### One-Click Setup (Recommended)
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# Linux/macOS
|
|
20
|
+
curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.sh | sh
|
|
21
|
+
|
|
22
|
+
# Windows (PowerShell)
|
|
23
|
+
irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Manual Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Requires Python 3.11+
|
|
30
|
+
uv tool install markitai
|
|
31
|
+
|
|
32
|
+
# Or using pip
|
|
33
|
+
pip install --user markitai
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Basic conversion
|
|
40
|
+
markitai document.docx
|
|
41
|
+
|
|
42
|
+
# URL conversion
|
|
43
|
+
markitai https://example.com/article
|
|
44
|
+
|
|
45
|
+
# LLM enhancement
|
|
46
|
+
markitai document.docx --llm
|
|
47
|
+
|
|
48
|
+
# Using presets
|
|
49
|
+
markitai document.pdf --preset rich # LLM + alt + desc + screenshot
|
|
50
|
+
markitai document.pdf --preset standard # LLM + alt + desc
|
|
51
|
+
markitai document.pdf --preset minimal # Basic conversion only
|
|
52
|
+
|
|
53
|
+
# Batch processing
|
|
54
|
+
markitai ./docs -o ./output
|
|
55
|
+
|
|
56
|
+
# Resume interrupted job
|
|
57
|
+
markitai ./docs -o ./output --resume
|
|
58
|
+
|
|
59
|
+
# Batch URL processing (auto-detect .urls files)
|
|
60
|
+
markitai urls.urls -o ./output
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Output Structure
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
output/
|
|
67
|
+
├── document.docx.md # Basic Markdown
|
|
68
|
+
├── document.docx.llm.md # LLM-enhanced version
|
|
69
|
+
├── assets/
|
|
70
|
+
│ ├── document.docx.0001.jpg
|
|
71
|
+
│ └── images.json # Image descriptions
|
|
72
|
+
├── screenshots/ # Page screenshots (with --screenshot)
|
|
73
|
+
│ └── example_com.full.jpg
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Configuration
|
|
77
|
+
|
|
78
|
+
Priority: CLI arguments > Environment variables > Config file > Defaults
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# View configuration
|
|
82
|
+
markitai config list
|
|
83
|
+
|
|
84
|
+
# Initialize config file
|
|
85
|
+
markitai config init -o .
|
|
86
|
+
|
|
87
|
+
# View cache status
|
|
88
|
+
markitai cache stats
|
|
89
|
+
|
|
90
|
+
# Clear cache
|
|
91
|
+
markitai cache clear
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Config file location: `./markitai.json` or `~/.markitai/config.json`
|
|
95
|
+
|
|
96
|
+
## Environment Variables
|
|
97
|
+
|
|
98
|
+
| Variable | Description |
|
|
99
|
+
|----------|-------------|
|
|
100
|
+
| `OPENAI_API_KEY` | OpenAI API Key |
|
|
101
|
+
| `GEMINI_API_KEY` | Google Gemini API Key |
|
|
102
|
+
| `DEEPSEEK_API_KEY` | DeepSeek API Key |
|
|
103
|
+
| `ANTHROPIC_API_KEY` | Anthropic API Key |
|
|
104
|
+
| `JINA_API_KEY` | Jina Reader API Key (URL conversion) |
|
|
105
|
+
|
|
106
|
+
## Dependencies
|
|
107
|
+
|
|
108
|
+
- [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF conversion
|
|
109
|
+
- [markitdown](https://github.com/microsoft/markitdown) - Office documents and URL conversion
|
|
110
|
+
- [LiteLLM](https://github.com/BerriAI/litellm) - LLM gateway
|
|
111
|
+
- [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR recognition
|
|
112
|
+
|
|
113
|
+
## Documentation
|
|
114
|
+
|
|
115
|
+
- [Getting Started](https://markitai.ynewtime.com/guide/getting-started)
|
|
116
|
+
- [Configuration](https://markitai.ynewtime.com/guide/configuration)
|
|
117
|
+
- [CLI Reference](https://markitai.ynewtime.com/guide/cli)
|
|
118
|
+
|
|
119
|
+
## License
|
|
120
|
+
|
|
121
|
+
MIT
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "markitai"
|
|
3
|
-
version = "0.3.0"
|
|
3
|
+
version = "0.3.1"
|
|
4
4
|
description = "Document to Markdown converter with LLM enhancement"
|
|
5
5
|
license = "MIT"
|
|
6
6
|
readme = "README.md"
|
|
@@ -32,6 +32,7 @@ dependencies = [
|
|
|
32
32
|
"loguru>=0.7.3",
|
|
33
33
|
"rich>=14.2.0",
|
|
34
34
|
"Pillow>=12.1.0",
|
|
35
|
+
"opencv-python>=4.8.0",
|
|
35
36
|
"aiofiles>=25.1.0",
|
|
36
37
|
"pydantic>=2.10.0",
|
|
37
38
|
"python-dotenv>=1.2.1",
|
|
@@ -1037,6 +1037,15 @@ class BatchProcessor:
|
|
|
1037
1037
|
self.save_state(force=True)
|
|
1038
1038
|
return self.state
|
|
1039
1039
|
|
|
1040
|
+
# Preheat OCR engine if OCR is enabled to eliminate cold start delay
|
|
1041
|
+
if options and options.get("ocr_enabled"):
|
|
1042
|
+
try:
|
|
1043
|
+
from markitai.ocr import OCRProcessor
|
|
1044
|
+
|
|
1045
|
+
OCRProcessor.preheat()
|
|
1046
|
+
except ImportError:
|
|
1047
|
+
logger.debug("OCR preheat skipped: RapidOCR not installed")
|
|
1048
|
+
|
|
1040
1049
|
# Create semaphore for concurrency control
|
|
1041
1050
|
semaphore = asyncio.Semaphore(self.config.concurrency)
|
|
1042
1051
|
|
|
@@ -978,9 +978,16 @@ def app(
|
|
|
978
978
|
# Cleanup shared resources
|
|
979
979
|
await close_shared_clients() # Close httpx.AsyncClient for Jina
|
|
980
980
|
shutdown_converter_executor() # Shutdown ThreadPoolExecutor
|
|
981
|
-
|
|
982
|
-
#
|
|
983
|
-
|
|
981
|
+
|
|
982
|
+
# Close LiteLLM's aiohttp sessions to prevent "Unclosed connection" warning
|
|
983
|
+
try:
|
|
984
|
+
from litellm.llms.custom_httpx.async_client_cleanup import (
|
|
985
|
+
close_litellm_async_clients,
|
|
986
|
+
)
|
|
987
|
+
|
|
988
|
+
await close_litellm_async_clients()
|
|
989
|
+
except Exception:
|
|
990
|
+
pass # Ignore cleanup errors
|
|
984
991
|
|
|
985
992
|
asyncio.run(run_workflow_with_cleanup())
|
|
986
993
|
|
|
@@ -1344,13 +1351,18 @@ def cache_stats(as_json: bool, verbose: bool, limit: int, scope: str) -> None:
|
|
|
1344
1351
|
default="project",
|
|
1345
1352
|
help="Which cache to clear (default: project).",
|
|
1346
1353
|
)
|
|
1354
|
+
@click.option(
|
|
1355
|
+
"--include-spa-domains",
|
|
1356
|
+
is_flag=True,
|
|
1357
|
+
help="Also clear learned SPA domains.",
|
|
1358
|
+
)
|
|
1347
1359
|
@click.option(
|
|
1348
1360
|
"--yes",
|
|
1349
1361
|
"-y",
|
|
1350
1362
|
is_flag=True,
|
|
1351
1363
|
help="Skip confirmation prompt.",
|
|
1352
1364
|
)
|
|
1353
|
-
def cache_clear(scope: str, yes: bool) -> None:
|
|
1365
|
+
def cache_clear(scope: str, include_spa_domains: bool, yes: bool) -> None:
|
|
1354
1366
|
"""Clear cache entries."""
|
|
1355
1367
|
from markitai.constants import (
|
|
1356
1368
|
DEFAULT_CACHE_DB_FILENAME,
|
|
@@ -1368,11 +1380,14 @@ def cache_clear(scope: str, yes: bool) -> None:
|
|
|
1368
1380
|
"global": "global cache (~/.markitai)",
|
|
1369
1381
|
"all": "ALL caches (project + global)",
|
|
1370
1382
|
}
|
|
1371
|
-
|
|
1383
|
+
desc = scope_desc[scope]
|
|
1384
|
+
if include_spa_domains:
|
|
1385
|
+
desc += " + learned SPA domains"
|
|
1386
|
+
if not click.confirm(f"Clear {desc}?"):
|
|
1372
1387
|
console.print("[yellow]Aborted[/yellow]")
|
|
1373
1388
|
return
|
|
1374
1389
|
|
|
1375
|
-
result = {"project": 0, "global": 0}
|
|
1390
|
+
result = {"project": 0, "global": 0, "spa_domains": 0}
|
|
1376
1391
|
|
|
1377
1392
|
# Clear project cache
|
|
1378
1393
|
if scope in ("project", "all"):
|
|
@@ -1400,18 +1415,105 @@ def cache_clear(scope: str, yes: bool) -> None:
|
|
|
1400
1415
|
except Exception as e:
|
|
1401
1416
|
console.print(f"[red]Failed to clear global cache:[/red] {e}")
|
|
1402
1417
|
|
|
1418
|
+
# Clear SPA domains if requested
|
|
1419
|
+
if include_spa_domains:
|
|
1420
|
+
from markitai.fetch import get_spa_domain_cache
|
|
1421
|
+
|
|
1422
|
+
try:
|
|
1423
|
+
spa_cache = get_spa_domain_cache()
|
|
1424
|
+
result["spa_domains"] = spa_cache.clear()
|
|
1425
|
+
except Exception as e:
|
|
1426
|
+
console.print(f"[red]Failed to clear SPA domains:[/red] {e}")
|
|
1427
|
+
|
|
1403
1428
|
# Report results
|
|
1404
1429
|
total = result["project"] + result["global"]
|
|
1405
|
-
if total > 0:
|
|
1430
|
+
if total > 0 or result["spa_domains"] > 0:
|
|
1406
1431
|
console.print(f"[green]Cleared {total} cache entries[/green]")
|
|
1407
1432
|
if result["project"] > 0:
|
|
1408
1433
|
console.print(f" Project: {result['project']}")
|
|
1409
1434
|
if result["global"] > 0:
|
|
1410
1435
|
console.print(f" Global: {result['global']}")
|
|
1436
|
+
if result["spa_domains"] > 0:
|
|
1437
|
+
console.print(f" SPA domains: {result['spa_domains']}")
|
|
1411
1438
|
else:
|
|
1412
1439
|
console.print("[dim]No cache entries to clear[/dim]")
|
|
1413
1440
|
|
|
1414
1441
|
|
|
1442
|
+
@cache.command("spa-domains")
|
|
1443
|
+
@click.option(
|
|
1444
|
+
"--json",
|
|
1445
|
+
"as_json",
|
|
1446
|
+
is_flag=True,
|
|
1447
|
+
help="Output as JSON.",
|
|
1448
|
+
)
|
|
1449
|
+
@click.option(
|
|
1450
|
+
"--clear",
|
|
1451
|
+
is_flag=True,
|
|
1452
|
+
help="Clear all learned SPA domains.",
|
|
1453
|
+
)
|
|
1454
|
+
def cache_spa_domains(as_json: bool, clear: bool) -> None:
|
|
1455
|
+
"""View or manage learned SPA domains.
|
|
1456
|
+
|
|
1457
|
+
Shows domains that were automatically detected as requiring browser
|
|
1458
|
+
rendering (JavaScript-heavy sites). These domains will use browser
|
|
1459
|
+
strategy directly on future requests, avoiding wasted static fetch attempts.
|
|
1460
|
+
"""
|
|
1461
|
+
from rich.table import Table
|
|
1462
|
+
|
|
1463
|
+
from markitai.fetch import get_spa_domain_cache
|
|
1464
|
+
|
|
1465
|
+
spa_cache = get_spa_domain_cache()
|
|
1466
|
+
|
|
1467
|
+
if clear:
|
|
1468
|
+
count = spa_cache.clear()
|
|
1469
|
+
if as_json:
|
|
1470
|
+
console.print(json.dumps({"cleared": count}))
|
|
1471
|
+
else:
|
|
1472
|
+
console.print(f"[green]Cleared {count} learned SPA domains[/green]")
|
|
1473
|
+
return
|
|
1474
|
+
|
|
1475
|
+
domains = spa_cache.list_domains()
|
|
1476
|
+
|
|
1477
|
+
if as_json:
|
|
1478
|
+
console.print(json.dumps(domains, indent=2, ensure_ascii=False), soft_wrap=True)
|
|
1479
|
+
return
|
|
1480
|
+
|
|
1481
|
+
if not domains:
|
|
1482
|
+
console.print("[dim]No learned SPA domains yet[/dim]")
|
|
1483
|
+
console.print(
|
|
1484
|
+
"\n[dim]Domains are learned automatically when static fetch "
|
|
1485
|
+
"detects JavaScript requirement.[/dim]"
|
|
1486
|
+
)
|
|
1487
|
+
return
|
|
1488
|
+
|
|
1489
|
+
console.print(f"[bold]Learned SPA Domains[/bold] ({len(domains)} total)\n")
|
|
1490
|
+
|
|
1491
|
+
table = Table()
|
|
1492
|
+
table.add_column("Domain", style="cyan")
|
|
1493
|
+
table.add_column("Hits", justify="right")
|
|
1494
|
+
table.add_column("Learned At", style="dim")
|
|
1495
|
+
table.add_column("Last Hit", style="dim")
|
|
1496
|
+
table.add_column("Status")
|
|
1497
|
+
|
|
1498
|
+
for d in domains:
|
|
1499
|
+
status = "[red]Expired[/red]" if d.get("expired") else "[green]Active[/green]"
|
|
1500
|
+
learned_at = d.get("learned_at", "")[:10] if d.get("learned_at") else "-"
|
|
1501
|
+
last_hit = d.get("last_hit", "")[:10] if d.get("last_hit") else "-"
|
|
1502
|
+
table.add_row(
|
|
1503
|
+
d["domain"],
|
|
1504
|
+
str(d.get("hits", 0)),
|
|
1505
|
+
learned_at,
|
|
1506
|
+
last_hit,
|
|
1507
|
+
status,
|
|
1508
|
+
)
|
|
1509
|
+
|
|
1510
|
+
console.print(table)
|
|
1511
|
+
console.print(
|
|
1512
|
+
"\n[dim]Tip: Use --clear to reset learned domains, "
|
|
1513
|
+
"or configure fallback_patterns in config file for permanent rules.[/dim]"
|
|
1514
|
+
)
|
|
1515
|
+
|
|
1516
|
+
|
|
1415
1517
|
# =============================================================================
|
|
1416
1518
|
# Check dependencies command
|
|
1417
1519
|
# =============================================================================
|
|
@@ -3097,7 +3199,7 @@ def _check_agent_browser_for_urls(cfg: Any, console: Any) -> None:
|
|
|
3097
3199
|
from rich.panel import Panel
|
|
3098
3200
|
|
|
3099
3201
|
warning_text = (
|
|
3100
|
-
f"[yellow]
|
|
3202
|
+
f"[yellow]{message}[/yellow]\n\n"
|
|
3101
3203
|
"[dim]URL processing will fall back to static fetch strategy.\n"
|
|
3102
3204
|
"For JavaScript-rendered pages (Twitter/X, etc.), browser support is recommended.\n\n"
|
|
3103
3205
|
"To install browser support:[/dim]\n"
|
|
@@ -203,17 +203,40 @@ class ScreenshotConfig(BaseModel):
|
|
|
203
203
|
|
|
204
204
|
|
|
205
205
|
class PromptsConfig(BaseModel):
|
|
206
|
-
"""Prompts configuration.
|
|
206
|
+
"""Prompts configuration for custom prompt overrides.
|
|
207
|
+
|
|
208
|
+
Each prompt has a system and user variant:
|
|
209
|
+
- system: Contains instructions and context
|
|
210
|
+
- user: Contains the actual request with content placeholders
|
|
211
|
+
"""
|
|
207
212
|
|
|
208
213
|
dir: str = DEFAULT_PROMPTS_DIR
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
214
|
+
# Cleaner prompts
|
|
215
|
+
cleaner_system: str | None = None
|
|
216
|
+
cleaner_user: str | None = None
|
|
217
|
+
# Frontmatter prompts
|
|
218
|
+
frontmatter_system: str | None = None
|
|
219
|
+
frontmatter_user: str | None = None
|
|
220
|
+
# Image prompts
|
|
221
|
+
image_caption_system: str | None = None
|
|
222
|
+
image_caption_user: str | None = None
|
|
223
|
+
image_description_system: str | None = None
|
|
224
|
+
image_description_user: str | None = None
|
|
225
|
+
image_analysis_system: str | None = None
|
|
226
|
+
image_analysis_user: str | None = None
|
|
227
|
+
# Page content prompts
|
|
228
|
+
page_content_system: str | None = None
|
|
229
|
+
page_content_user: str | None = None
|
|
230
|
+
# Document prompts
|
|
231
|
+
document_enhance_system: str | None = None
|
|
232
|
+
document_enhance_user: str | None = None
|
|
233
|
+
document_enhance_complete_system: str | None = None
|
|
234
|
+
document_enhance_complete_user: str | None = None
|
|
235
|
+
document_process_system: str | None = None
|
|
236
|
+
document_process_user: str | None = None
|
|
237
|
+
# URL prompts
|
|
238
|
+
url_enhance_system: str | None = None
|
|
239
|
+
url_enhance_user: str | None = None
|
|
217
240
|
|
|
218
241
|
|
|
219
242
|
class BatchConfig(BaseModel):
|