markitai 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. {markitai-0.3.0 → markitai-0.3.1}/PKG-INFO +49 -47
  2. markitai-0.3.1/README.md +121 -0
  3. {markitai-0.3.0 → markitai-0.3.1}/pyproject.toml +2 -1
  4. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/__init__.py +1 -1
  5. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/batch.py +9 -0
  6. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/cli.py +110 -8
  7. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/config.py +32 -9
  8. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/config.schema.json +85 -81
  9. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/constants.py +2 -2
  10. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/_patches.py +1 -1
  11. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/pdf.py +6 -2
  12. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/fetch.py +1028 -148
  13. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/image.py +145 -3
  14. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/llm.py +292 -157
  15. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/ocr.py +103 -32
  16. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/prompts/__init__.py +27 -11
  17. markitai-0.3.1/src/markitai/prompts/cleaner_system.md +33 -0
  18. markitai-0.3.1/src/markitai/prompts/cleaner_user.md +3 -0
  19. markitai-0.3.0/src/markitai/prompts/document_enhance_complete.md → markitai-0.3.1/src/markitai/prompts/document_enhance_complete_system.md +17 -12
  20. markitai-0.3.1/src/markitai/prompts/document_enhance_complete_user.md +5 -0
  21. markitai-0.3.0/src/markitai/prompts/document_enhance.md → markitai-0.3.1/src/markitai/prompts/document_enhance_system.md +8 -7
  22. markitai-0.3.1/src/markitai/prompts/document_enhance_user.md +5 -0
  23. markitai-0.3.1/src/markitai/prompts/document_process_system.md +39 -0
  24. markitai-0.3.1/src/markitai/prompts/document_process_user.md +5 -0
  25. markitai-0.3.1/src/markitai/prompts/frontmatter_system.md +24 -0
  26. markitai-0.3.1/src/markitai/prompts/frontmatter_user.md +5 -0
  27. markitai-0.3.0/src/markitai/prompts/image_analysis.md → markitai-0.3.1/src/markitai/prompts/image_analysis_system.md +10 -4
  28. markitai-0.3.1/src/markitai/prompts/image_analysis_user.md +1 -0
  29. markitai-0.3.0/src/markitai/prompts/image_caption.md → markitai-0.3.1/src/markitai/prompts/image_caption_system.md +6 -2
  30. markitai-0.3.1/src/markitai/prompts/image_caption_user.md +1 -0
  31. markitai-0.3.0/src/markitai/prompts/image_description.md → markitai-0.3.1/src/markitai/prompts/image_description_system.md +6 -3
  32. markitai-0.3.1/src/markitai/prompts/image_description_user.md +1 -0
  33. markitai-0.3.0/src/markitai/prompts/page_content.md → markitai-0.3.1/src/markitai/prompts/page_content_system.md +5 -2
  34. markitai-0.3.1/src/markitai/prompts/page_content_user.md +1 -0
  35. markitai-0.3.0/src/markitai/prompts/url_enhance.md → markitai-0.3.1/src/markitai/prompts/url_enhance_system.md +17 -7
  36. markitai-0.3.1/src/markitai/prompts/url_enhance_user.md +5 -0
  37. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/executor.py +18 -1
  38. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/text.py +35 -0
  39. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/workflow/core.py +6 -0
  40. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/workflow/single.py +6 -3
  41. {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/sub_dir/file_example_PPT_250kB.ppt +0 -0
  42. {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/sub_dir/file_example_XLS_100.xls +0 -0
  43. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_executor.py +51 -0
  44. markitai-0.3.1/tests/unit/test_fetch.py +789 -0
  45. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_image.py +214 -1
  46. markitai-0.3.1/tests/unit/test_ocr.py +355 -0
  47. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_prompts.py +46 -35
  48. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_schema_sync.py +11 -13
  49. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_workflow_core.py +26 -0
  50. markitai-0.3.0/README.md +0 -120
  51. markitai-0.3.0/src/markitai/prompts/cleaner.md +0 -93
  52. markitai-0.3.0/src/markitai/prompts/document_process.md +0 -60
  53. markitai-0.3.0/src/markitai/prompts/frontmatter.md +0 -28
  54. markitai-0.3.0/tests/unit/test_fetch.py +0 -360
  55. markitai-0.3.0/tests/unit/test_ocr.py +0 -209
  56. {markitai-0.3.0 → markitai-0.3.1}/.gitignore +0 -0
  57. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/__init__.py +0 -0
  58. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/base.py +0 -0
  59. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/image.py +0 -0
  60. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/legacy.py +0 -0
  61. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/office.py +0 -0
  62. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/converter/text.py +0 -0
  63. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/json_order.py +0 -0
  64. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/security.py +0 -0
  65. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/types.py +0 -0
  66. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/urls.py +0 -0
  67. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/__init__.py +0 -0
  68. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/mime.py +0 -0
  69. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/office.py +0 -0
  70. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/output.py +0 -0
  71. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/utils/paths.py +0 -0
  72. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/workflow/__init__.py +0 -0
  73. {markitai-0.3.0 → markitai-0.3.1}/src/markitai/workflow/helpers.py +0 -0
  74. {markitai-0.3.0 → markitai-0.3.1}/tests/SKILL.md +0 -0
  75. {markitai-0.3.0 → markitai-0.3.1}/tests/__init__.py +0 -0
  76. {markitai-0.3.0 → markitai-0.3.1}/tests/conftest.py +0 -0
  77. {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/Free_Test_Data_500KB_PPTX.pptx +0 -0
  78. {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/candy.JPG +0 -0
  79. {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/file-example_PDF_500_kB.pdf +0 -0
  80. {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/file_example_XLSX_100.xlsx +0 -0
  81. {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/sub_dir/file-sample_100kB.doc +0 -0
  82. {markitai-0.3.0 → markitai-0.3.1}/tests/fixtures/test.urls +0 -0
  83. {markitai-0.3.0 → markitai-0.3.1}/tests/integration/__init__.py +0 -0
  84. {markitai-0.3.0 → markitai-0.3.1}/tests/integration/test_cache.py +0 -0
  85. {markitai-0.3.0 → markitai-0.3.1}/tests/integration/test_cli.py +0 -0
  86. {markitai-0.3.0 → markitai-0.3.1}/tests/integration/test_output_format.py +0 -0
  87. {markitai-0.3.0 → markitai-0.3.1}/tests/integration/test_url.py +0 -0
  88. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/__init__.py +0 -0
  89. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_atomic.py +0 -0
  90. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_batch.py +0 -0
  91. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_cli_helpers.py +0 -0
  92. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_config.py +0 -0
  93. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_converter.py +0 -0
  94. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_image_converter.py +0 -0
  95. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_json_order.py +0 -0
  96. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_llm.py +0 -0
  97. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_llm_runtime.py +0 -0
  98. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_security.py +0 -0
  99. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_workflow_helpers.py +0 -0
  100. {markitai-0.3.0 → markitai-0.3.1}/tests/unit/test_workflow_single.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: markitai
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Document to Markdown converter with LLM enhancement
5
5
  Project-URL: Homepage, https://markitai.ynewtime.com
6
6
  Project-URL: Documentation, https://markitai.ynewtime.com/guide/getting-started
@@ -27,6 +27,7 @@ Requires-Dist: instructor>=1.14.0
27
27
  Requires-Dist: litellm>=1.80.16
28
28
  Requires-Dist: loguru>=0.7.3
29
29
  Requires-Dist: markitdown[all]>=0.1.4
30
+ Requires-Dist: opencv-python>=4.8.0
30
31
  Requires-Dist: pillow>=12.1.0
31
32
  Requires-Dist: pydantic>=2.10.0
32
33
  Requires-Dist: pymupdf4llm>=0.2.9
@@ -39,19 +40,20 @@ Description-Content-Type: text/markdown
39
40
 
40
41
  # Markitai
41
42
 
42
- 开箱即用的 Markdown 转换器,原生支持 LLM 增强。
43
+ Opinionated Markdown converter with native LLM enhancement support.
43
44
 
44
- ## 特性
45
+ ## Features
45
46
 
46
- - **多格式支持** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
47
- - **LLM 增强** - 格式清洗、元数据生成、图片分析
48
- - **批量处理** - 并发转换、断点恢复、进度显示
49
- - **OCR 识别** - 扫描版 PDF 和图片文字提取
50
- - **URL 转换** - 直接转换网页,支持 SPA 浏览器渲染
47
+ - **Multi-format Support** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
48
+ - **LLM Enhancement** - Format cleaning, metadata generation, image analysis
49
+ - **Batch Processing** - Concurrent conversion, resume capability, progress display
50
+ - **OCR Recognition** - Text extraction from scanned PDFs and images
51
+ - **URL Conversion** - Direct webpage conversion with SPA browser rendering support
52
+ - **Smart Caching** - LLM result caching, SPA domain learning, auto-proxy detection
51
53
 
52
- ## 安装
54
+ ## Installation
53
55
 
54
- ### 一键安装(推荐)
56
+ ### One-Click Setup (Recommended)
55
57
 
56
58
  ```bash
57
59
  # Linux/macOS
@@ -61,98 +63,98 @@ curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setu
61
63
  irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
62
64
  ```
63
65
 
64
- ### 手动安装
66
+ ### Manual Installation
65
67
 
66
68
  ```bash
67
- # 需要 Python 3.11+
69
+ # Requires Python 3.11+
68
70
  uv tool install markitai
69
71
 
70
- # 或使用 pip
72
+ # Or using pip
71
73
  pip install --user markitai
72
74
  ```
73
75
 
74
- ## 快速开始
76
+ ## Quick Start
75
77
 
76
78
  ```bash
77
- # 基础转换
79
+ # Basic conversion
78
80
  markitai document.docx
79
81
 
80
- # URL 转换
82
+ # URL conversion
81
83
  markitai https://example.com/article
82
84
 
83
- # LLM 增强
85
+ # LLM enhancement
84
86
  markitai document.docx --llm
85
87
 
86
- # 使用预设
88
+ # Using presets
87
89
  markitai document.pdf --preset rich # LLM + alt + desc + screenshot
88
90
  markitai document.pdf --preset standard # LLM + alt + desc
89
- markitai document.pdf --preset minimal # 仅基础转换
91
+ markitai document.pdf --preset minimal # Basic conversion only
90
92
 
91
- # 批量处理
93
+ # Batch processing
92
94
  markitai ./docs -o ./output
93
95
 
94
- # 断点恢复
96
+ # Resume interrupted job
95
97
  markitai ./docs -o ./output --resume
96
98
 
97
- # URL 批量处理(自动识别 .urls 文件)
99
+ # Batch URL processing (auto-detect .urls files)
98
100
  markitai urls.urls -o ./output
99
101
  ```
100
102
 
101
- ## 输出结构
103
+ ## Output Structure
102
104
 
103
105
  ```
104
106
  output/
105
- ├── document.docx.md # 基础 Markdown
106
- ├── document.docx.llm.md # LLM 优化版
107
+ ├── document.docx.md # Basic Markdown
108
+ ├── document.docx.llm.md # LLM-enhanced version
107
109
  ├── assets/
108
110
  │ ├── document.docx.0001.jpg
109
- │ └── images.json # 图片描述
110
- ├── screenshots/ # 页面截图(--screenshot 时)
111
+ │ └── images.json # Image descriptions
112
+ ├── screenshots/ # Page screenshots (with --screenshot)
111
113
  │ └── example_com.full.jpg
112
114
  ```
113
115
 
114
- ## 配置
116
+ ## Configuration
115
117
 
116
- 优先级:命令行 > 环境变量 > 配置文件 > 默认值
118
+ Priority: CLI arguments > Environment variables > Config file > Defaults
117
119
 
118
120
  ```bash
119
- # 查看配置
121
+ # View configuration
120
122
  markitai config list
121
123
 
122
- # 初始化配置文件
124
+ # Initialize config file
123
125
  markitai config init -o .
124
126
 
125
- # 查看缓存状态
127
+ # View cache status
126
128
  markitai cache stats
127
129
 
128
- # 清理缓存
130
+ # Clear cache
129
131
  markitai cache clear
130
132
  ```
131
133
 
132
- 配置文件路径:`./markitai.json` 或 `~/.markitai/config.json`
134
+ Config file location: `./markitai.json` or `~/.markitai/config.json`
133
135
 
134
- ## 环境变量
136
+ ## Environment Variables
135
137
 
136
- | 变量 | 说明 |
137
- |------|------|
138
+ | Variable | Description |
139
+ |----------|-------------|
138
140
  | `OPENAI_API_KEY` | OpenAI API Key |
139
141
  | `GEMINI_API_KEY` | Google Gemini API Key |
140
142
  | `DEEPSEEK_API_KEY` | DeepSeek API Key |
141
143
  | `ANTHROPIC_API_KEY` | Anthropic API Key |
142
- | `JINA_API_KEY` | Jina Reader API Key(URL 转换) |
144
+ | `JINA_API_KEY` | Jina Reader API Key (URL conversion) |
143
145
 
144
- ## 依赖
146
+ ## Dependencies
145
147
 
146
- - [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF 转换
147
- - [markitdown](https://github.com/microsoft/markitdown) - Office 文档和 URL 转换
148
- - [LiteLLM](https://github.com/BerriAI/litellm) - LLM 网关
149
- - [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR 识别
148
+ - [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF conversion
149
+ - [markitdown](https://github.com/microsoft/markitdown) - Office documents and URL conversion
150
+ - [LiteLLM](https://github.com/BerriAI/litellm) - LLM gateway
151
+ - [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR recognition
150
152
 
151
- ## 文档
153
+ ## Documentation
152
154
 
153
- - [快速开始](https://ynewtime.github.io/markitai/guide/getting-started)
154
- - [配置说明](https://ynewtime.github.io/markitai/guide/configuration)
155
- - [CLI 命令参考](https://ynewtime.github.io/markitai/guide/cli)
155
+ - [Getting Started](https://markitai.ynewtime.com/guide/getting-started)
156
+ - [Configuration](https://markitai.ynewtime.com/guide/configuration)
157
+ - [CLI Reference](https://markitai.ynewtime.com/guide/cli)
156
158
 
157
159
  ## License
158
160
 
@@ -0,0 +1,121 @@
1
+ # Markitai
2
+
3
+ Opinionated Markdown converter with native LLM enhancement support.
4
+
5
+ ## Features
6
+
7
+ - **Multi-format Support** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
8
+ - **LLM Enhancement** - Format cleaning, metadata generation, image analysis
9
+ - **Batch Processing** - Concurrent conversion, resume capability, progress display
10
+ - **OCR Recognition** - Text extraction from scanned PDFs and images
11
+ - **URL Conversion** - Direct webpage conversion with SPA browser rendering support
12
+ - **Smart Caching** - LLM result caching, SPA domain learning, auto-proxy detection
13
+
14
+ ## Installation
15
+
16
+ ### One-Click Setup (Recommended)
17
+
18
+ ```bash
19
+ # Linux/macOS
20
+ curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.sh | sh
21
+
22
+ # Windows (PowerShell)
23
+ irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
24
+ ```
25
+
26
+ ### Manual Installation
27
+
28
+ ```bash
29
+ # Requires Python 3.11+
30
+ uv tool install markitai
31
+
32
+ # Or using pip
33
+ pip install --user markitai
34
+ ```
35
+
36
+ ## Quick Start
37
+
38
+ ```bash
39
+ # Basic conversion
40
+ markitai document.docx
41
+
42
+ # URL conversion
43
+ markitai https://example.com/article
44
+
45
+ # LLM enhancement
46
+ markitai document.docx --llm
47
+
48
+ # Using presets
49
+ markitai document.pdf --preset rich # LLM + alt + desc + screenshot
50
+ markitai document.pdf --preset standard # LLM + alt + desc
51
+ markitai document.pdf --preset minimal # Basic conversion only
52
+
53
+ # Batch processing
54
+ markitai ./docs -o ./output
55
+
56
+ # Resume interrupted job
57
+ markitai ./docs -o ./output --resume
58
+
59
+ # Batch URL processing (auto-detect .urls files)
60
+ markitai urls.urls -o ./output
61
+ ```
62
+
63
+ ## Output Structure
64
+
65
+ ```
66
+ output/
67
+ ├── document.docx.md # Basic Markdown
68
+ ├── document.docx.llm.md # LLM-enhanced version
69
+ ├── assets/
70
+ │ ├── document.docx.0001.jpg
71
+ │ └── images.json # Image descriptions
72
+ ├── screenshots/ # Page screenshots (with --screenshot)
73
+ │ └── example_com.full.jpg
74
+ ```
75
+
76
+ ## Configuration
77
+
78
+ Priority: CLI arguments > Environment variables > Config file > Defaults
79
+
80
+ ```bash
81
+ # View configuration
82
+ markitai config list
83
+
84
+ # Initialize config file
85
+ markitai config init -o .
86
+
87
+ # View cache status
88
+ markitai cache stats
89
+
90
+ # Clear cache
91
+ markitai cache clear
92
+ ```
93
+
94
+ Config file location: `./markitai.json` or `~/.markitai/config.json`
95
+
96
+ ## Environment Variables
97
+
98
+ | Variable | Description |
99
+ |----------|-------------|
100
+ | `OPENAI_API_KEY` | OpenAI API Key |
101
+ | `GEMINI_API_KEY` | Google Gemini API Key |
102
+ | `DEEPSEEK_API_KEY` | DeepSeek API Key |
103
+ | `ANTHROPIC_API_KEY` | Anthropic API Key |
104
+ | `JINA_API_KEY` | Jina Reader API Key (URL conversion) |
105
+
106
+ ## Dependencies
107
+
108
+ - [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF conversion
109
+ - [markitdown](https://github.com/microsoft/markitdown) - Office documents and URL conversion
110
+ - [LiteLLM](https://github.com/BerriAI/litellm) - LLM gateway
111
+ - [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR recognition
112
+
113
+ ## Documentation
114
+
115
+ - [Getting Started](https://markitai.ynewtime.com/guide/getting-started)
116
+ - [Configuration](https://markitai.ynewtime.com/guide/configuration)
117
+ - [CLI Reference](https://markitai.ynewtime.com/guide/cli)
118
+
119
+ ## License
120
+
121
+ MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "markitai"
3
- version = "0.3.0"
3
+ version = "0.3.1"
4
4
  description = "Document to Markdown converter with LLM enhancement"
5
5
  license = "MIT"
6
6
  readme = "README.md"
@@ -32,6 +32,7 @@ dependencies = [
32
32
  "loguru>=0.7.3",
33
33
  "rich>=14.2.0",
34
34
  "Pillow>=12.1.0",
35
+ "opencv-python>=4.8.0",
35
36
  "aiofiles>=25.1.0",
36
37
  "pydantic>=2.10.0",
37
38
  "python-dotenv>=1.2.1",
@@ -1,3 +1,3 @@
1
1
  """Markitai - Document to Markdown converter with LLM enhancement."""
2
2
 
3
- __version__ = "0.3.0"
3
+ __version__ = "0.3.1"
@@ -1037,6 +1037,15 @@ class BatchProcessor:
1037
1037
  self.save_state(force=True)
1038
1038
  return self.state
1039
1039
 
1040
+ # Preheat OCR engine if OCR is enabled to eliminate cold start delay
1041
+ if options and options.get("ocr_enabled"):
1042
+ try:
1043
+ from markitai.ocr import OCRProcessor
1044
+
1045
+ OCRProcessor.preheat()
1046
+ except ImportError:
1047
+ logger.debug("OCR preheat skipped: RapidOCR not installed")
1048
+
1040
1049
  # Create semaphore for concurrency control
1041
1050
  semaphore = asyncio.Semaphore(self.config.concurrency)
1042
1051
 
@@ -978,9 +978,16 @@ def app(
978
978
  # Cleanup shared resources
979
979
  await close_shared_clients() # Close httpx.AsyncClient for Jina
980
980
  shutdown_converter_executor() # Shutdown ThreadPoolExecutor
981
- # Note: FetchCache cleanup happens automatically when process exits
982
- # as SQLite handles connection cleanup. For explicit cleanup, the
983
- # global _fetch_cache.close() could be called, but it's not critical.
981
+
982
+ # Close LiteLLM's aiohttp sessions to prevent "Unclosed connection" warning
983
+ try:
984
+ from litellm.llms.custom_httpx.async_client_cleanup import (
985
+ close_litellm_async_clients,
986
+ )
987
+
988
+ await close_litellm_async_clients()
989
+ except Exception:
990
+ pass # Ignore cleanup errors
984
991
 
985
992
  asyncio.run(run_workflow_with_cleanup())
986
993
 
@@ -1344,13 +1351,18 @@ def cache_stats(as_json: bool, verbose: bool, limit: int, scope: str) -> None:
1344
1351
  default="project",
1345
1352
  help="Which cache to clear (default: project).",
1346
1353
  )
1354
+ @click.option(
1355
+ "--include-spa-domains",
1356
+ is_flag=True,
1357
+ help="Also clear learned SPA domains.",
1358
+ )
1347
1359
  @click.option(
1348
1360
  "--yes",
1349
1361
  "-y",
1350
1362
  is_flag=True,
1351
1363
  help="Skip confirmation prompt.",
1352
1364
  )
1353
- def cache_clear(scope: str, yes: bool) -> None:
1365
+ def cache_clear(scope: str, include_spa_domains: bool, yes: bool) -> None:
1354
1366
  """Clear cache entries."""
1355
1367
  from markitai.constants import (
1356
1368
  DEFAULT_CACHE_DB_FILENAME,
@@ -1368,11 +1380,14 @@ def cache_clear(scope: str, yes: bool) -> None:
1368
1380
  "global": "global cache (~/.markitai)",
1369
1381
  "all": "ALL caches (project + global)",
1370
1382
  }
1371
- if not click.confirm(f"Clear {scope_desc[scope]}?"):
1383
+ desc = scope_desc[scope]
1384
+ if include_spa_domains:
1385
+ desc += " + learned SPA domains"
1386
+ if not click.confirm(f"Clear {desc}?"):
1372
1387
  console.print("[yellow]Aborted[/yellow]")
1373
1388
  return
1374
1389
 
1375
- result = {"project": 0, "global": 0}
1390
+ result = {"project": 0, "global": 0, "spa_domains": 0}
1376
1391
 
1377
1392
  # Clear project cache
1378
1393
  if scope in ("project", "all"):
@@ -1400,18 +1415,105 @@ def cache_clear(scope: str, yes: bool) -> None:
1400
1415
  except Exception as e:
1401
1416
  console.print(f"[red]Failed to clear global cache:[/red] {e}")
1402
1417
 
1418
+ # Clear SPA domains if requested
1419
+ if include_spa_domains:
1420
+ from markitai.fetch import get_spa_domain_cache
1421
+
1422
+ try:
1423
+ spa_cache = get_spa_domain_cache()
1424
+ result["spa_domains"] = spa_cache.clear()
1425
+ except Exception as e:
1426
+ console.print(f"[red]Failed to clear SPA domains:[/red] {e}")
1427
+
1403
1428
  # Report results
1404
1429
  total = result["project"] + result["global"]
1405
- if total > 0:
1430
+ if total > 0 or result["spa_domains"] > 0:
1406
1431
  console.print(f"[green]Cleared {total} cache entries[/green]")
1407
1432
  if result["project"] > 0:
1408
1433
  console.print(f" Project: {result['project']}")
1409
1434
  if result["global"] > 0:
1410
1435
  console.print(f" Global: {result['global']}")
1436
+ if result["spa_domains"] > 0:
1437
+ console.print(f" SPA domains: {result['spa_domains']}")
1411
1438
  else:
1412
1439
  console.print("[dim]No cache entries to clear[/dim]")
1413
1440
 
1414
1441
 
1442
+ @cache.command("spa-domains")
1443
+ @click.option(
1444
+ "--json",
1445
+ "as_json",
1446
+ is_flag=True,
1447
+ help="Output as JSON.",
1448
+ )
1449
+ @click.option(
1450
+ "--clear",
1451
+ is_flag=True,
1452
+ help="Clear all learned SPA domains.",
1453
+ )
1454
+ def cache_spa_domains(as_json: bool, clear: bool) -> None:
1455
+ """View or manage learned SPA domains.
1456
+
1457
+ Shows domains that were automatically detected as requiring browser
1458
+ rendering (JavaScript-heavy sites). These domains will use browser
1459
+ strategy directly on future requests, avoiding wasted static fetch attempts.
1460
+ """
1461
+ from rich.table import Table
1462
+
1463
+ from markitai.fetch import get_spa_domain_cache
1464
+
1465
+ spa_cache = get_spa_domain_cache()
1466
+
1467
+ if clear:
1468
+ count = spa_cache.clear()
1469
+ if as_json:
1470
+ console.print(json.dumps({"cleared": count}))
1471
+ else:
1472
+ console.print(f"[green]Cleared {count} learned SPA domains[/green]")
1473
+ return
1474
+
1475
+ domains = spa_cache.list_domains()
1476
+
1477
+ if as_json:
1478
+ console.print(json.dumps(domains, indent=2, ensure_ascii=False), soft_wrap=True)
1479
+ return
1480
+
1481
+ if not domains:
1482
+ console.print("[dim]No learned SPA domains yet[/dim]")
1483
+ console.print(
1484
+ "\n[dim]Domains are learned automatically when static fetch "
1485
+ "detects JavaScript requirement.[/dim]"
1486
+ )
1487
+ return
1488
+
1489
+ console.print(f"[bold]Learned SPA Domains[/bold] ({len(domains)} total)\n")
1490
+
1491
+ table = Table()
1492
+ table.add_column("Domain", style="cyan")
1493
+ table.add_column("Hits", justify="right")
1494
+ table.add_column("Learned At", style="dim")
1495
+ table.add_column("Last Hit", style="dim")
1496
+ table.add_column("Status")
1497
+
1498
+ for d in domains:
1499
+ status = "[red]Expired[/red]" if d.get("expired") else "[green]Active[/green]"
1500
+ learned_at = d.get("learned_at", "")[:10] if d.get("learned_at") else "-"
1501
+ last_hit = d.get("last_hit", "")[:10] if d.get("last_hit") else "-"
1502
+ table.add_row(
1503
+ d["domain"],
1504
+ str(d.get("hits", 0)),
1505
+ learned_at,
1506
+ last_hit,
1507
+ status,
1508
+ )
1509
+
1510
+ console.print(table)
1511
+ console.print(
1512
+ "\n[dim]Tip: Use --clear to reset learned domains, "
1513
+ "or configure fallback_patterns in config file for permanent rules.[/dim]"
1514
+ )
1515
+
1516
+
1415
1517
  # =============================================================================
1416
1518
  # Check dependencies command
1417
1519
  # =============================================================================
@@ -3097,7 +3199,7 @@ def _check_agent_browser_for_urls(cfg: Any, console: Any) -> None:
3097
3199
  from rich.panel import Panel
3098
3200
 
3099
3201
  warning_text = (
3100
- f"[yellow]{message}[/yellow]\n\n"
3202
+ f"[yellow]{message}[/yellow]\n\n"
3101
3203
  "[dim]URL processing will fall back to static fetch strategy.\n"
3102
3204
  "For JavaScript-rendered pages (Twitter/X, etc.), browser support is recommended.\n\n"
3103
3205
  "To install browser support:[/dim]\n"
@@ -203,17 +203,40 @@ class ScreenshotConfig(BaseModel):
203
203
 
204
204
 
205
205
  class PromptsConfig(BaseModel):
206
- """Prompts configuration."""
206
+ """Prompts configuration for custom prompt overrides.
207
+
208
+ Each prompt has a system and user variant:
209
+ - system: Contains instructions and context
210
+ - user: Contains the actual request with content placeholders
211
+ """
207
212
 
208
213
  dir: str = DEFAULT_PROMPTS_DIR
209
- cleaner: str | None = None
210
- frontmatter: str | None = None
211
- image_caption: str | None = None
212
- image_description: str | None = None
213
- image_analysis: str | None = None # Combined caption + description
214
- page_content: str | None = None # Page content extraction
215
- document_enhance: str | None = None # Document enhancement with vision
216
- url_enhance: str | None = None # URL/web page content enhancement
214
+ # Cleaner prompts
215
+ cleaner_system: str | None = None
216
+ cleaner_user: str | None = None
217
+ # Frontmatter prompts
218
+ frontmatter_system: str | None = None
219
+ frontmatter_user: str | None = None
220
+ # Image prompts
221
+ image_caption_system: str | None = None
222
+ image_caption_user: str | None = None
223
+ image_description_system: str | None = None
224
+ image_description_user: str | None = None
225
+ image_analysis_system: str | None = None
226
+ image_analysis_user: str | None = None
227
+ # Page content prompts
228
+ page_content_system: str | None = None
229
+ page_content_user: str | None = None
230
+ # Document prompts
231
+ document_enhance_system: str | None = None
232
+ document_enhance_user: str | None = None
233
+ document_enhance_complete_system: str | None = None
234
+ document_enhance_complete_user: str | None = None
235
+ document_process_system: str | None = None
236
+ document_process_user: str | None = None
237
+ # URL prompts
238
+ url_enhance_system: str | None = None
239
+ url_enhance_user: str | None = None
217
240
 
218
241
 
219
242
  class BatchConfig(BaseModel):