markitai 0.3.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. {markitai-0.3.0 → markitai-0.4.1}/.gitignore +7 -7
  2. markitai-0.4.1/PKG-INFO +196 -0
  3. markitai-0.4.1/README.md +147 -0
  4. {markitai-0.3.0 → markitai-0.4.1}/pyproject.toml +17 -7
  5. markitai-0.4.1/src/markitai/__init__.py +3 -0
  6. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/batch.py +50 -17
  7. markitai-0.4.1/src/markitai/cli/__init__.py +52 -0
  8. markitai-0.4.1/src/markitai/cli/commands/__init__.py +18 -0
  9. markitai-0.4.1/src/markitai/cli/commands/cache.py +292 -0
  10. markitai-0.4.1/src/markitai/cli/commands/config.py +240 -0
  11. markitai-0.4.1/src/markitai/cli/commands/doctor.py +561 -0
  12. markitai-0.4.1/src/markitai/cli/console.py +50 -0
  13. markitai-0.4.1/src/markitai/cli/framework.py +130 -0
  14. markitai-0.4.1/src/markitai/cli/logging_config.py +377 -0
  15. markitai-0.4.1/src/markitai/cli/main.py +1036 -0
  16. markitai-0.4.1/src/markitai/cli/processors/__init__.py +47 -0
  17. markitai-0.4.1/src/markitai/cli/processors/batch.py +877 -0
  18. markitai-0.4.1/src/markitai/cli/processors/file.py +226 -0
  19. markitai-0.4.1/src/markitai/cli/processors/llm.py +383 -0
  20. markitai-0.4.1/src/markitai/cli/processors/url.py +1050 -0
  21. markitai-0.4.1/src/markitai/cli/processors/validators.py +265 -0
  22. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/config.py +114 -33
  23. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/config.schema.json +98 -106
  24. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/constants.py +53 -11
  25. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/converter/_patches.py +1 -1
  26. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/converter/pdf.py +8 -4
  27. markitai-0.4.1/src/markitai/fetch.py +2324 -0
  28. markitai-0.4.1/src/markitai/fetch_playwright.py +482 -0
  29. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/image.py +189 -7
  30. markitai-0.4.1/src/markitai/llm/__init__.py +100 -0
  31. markitai-0.4.1/src/markitai/llm/cache.py +521 -0
  32. markitai-0.4.1/src/markitai/llm/content.py +632 -0
  33. markitai-0.4.1/src/markitai/llm/document.py +1525 -0
  34. markitai-0.4.1/src/markitai/llm/models.py +205 -0
  35. markitai-0.4.1/src/markitai/llm/processor.py +2361 -0
  36. markitai-0.4.1/src/markitai/llm/types.py +201 -0
  37. markitai-0.4.1/src/markitai/llm/vision.py +866 -0
  38. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/ocr.py +105 -35
  39. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/prompts/__init__.py +30 -11
  40. markitai-0.4.1/src/markitai/prompts/cleaner_system.md +43 -0
  41. markitai-0.4.1/src/markitai/prompts/cleaner_user.md +3 -0
  42. markitai-0.4.1/src/markitai/prompts/document_enhance_complete_system.md +104 -0
  43. markitai-0.4.1/src/markitai/prompts/document_enhance_complete_user.md +5 -0
  44. markitai-0.3.0/src/markitai/prompts/document_enhance.md → markitai-0.4.1/src/markitai/prompts/document_enhance_system.md +40 -8
  45. markitai-0.4.1/src/markitai/prompts/document_enhance_user.md +5 -0
  46. markitai-0.4.1/src/markitai/prompts/document_process_system.md +56 -0
  47. markitai-0.4.1/src/markitai/prompts/document_process_user.md +5 -0
  48. markitai-0.4.1/src/markitai/prompts/document_vision_system.md +117 -0
  49. markitai-0.4.1/src/markitai/prompts/document_vision_user.md +5 -0
  50. markitai-0.3.0/src/markitai/prompts/image_analysis.md → markitai-0.4.1/src/markitai/prompts/image_analysis_system.md +10 -4
  51. markitai-0.4.1/src/markitai/prompts/image_analysis_user.md +1 -0
  52. markitai-0.3.0/src/markitai/prompts/image_caption.md → markitai-0.4.1/src/markitai/prompts/image_caption_system.md +6 -2
  53. markitai-0.4.1/src/markitai/prompts/image_caption_user.md +1 -0
  54. markitai-0.3.0/src/markitai/prompts/image_description.md → markitai-0.4.1/src/markitai/prompts/image_description_system.md +6 -3
  55. markitai-0.4.1/src/markitai/prompts/image_description_user.md +1 -0
  56. markitai-0.3.0/src/markitai/prompts/page_content.md → markitai-0.4.1/src/markitai/prompts/page_content_system.md +5 -2
  57. markitai-0.4.1/src/markitai/prompts/page_content_user.md +1 -0
  58. markitai-0.4.1/src/markitai/prompts/screenshot_extract_system.md +76 -0
  59. markitai-0.4.1/src/markitai/prompts/screenshot_extract_user.md +1 -0
  60. markitai-0.3.0/src/markitai/prompts/url_enhance.md → markitai-0.4.1/src/markitai/prompts/url_enhance_system.md +46 -14
  61. markitai-0.4.1/src/markitai/prompts/url_enhance_user.md +5 -0
  62. markitai-0.4.1/src/markitai/providers/__init__.py +695 -0
  63. markitai-0.4.1/src/markitai/providers/auth.py +351 -0
  64. markitai-0.4.1/src/markitai/providers/claude_agent.py +649 -0
  65. markitai-0.4.1/src/markitai/providers/copilot.py +844 -0
  66. markitai-0.4.1/src/markitai/providers/errors.py +225 -0
  67. markitai-0.4.1/src/markitai/providers/json_mode.py +217 -0
  68. markitai-0.4.1/src/markitai/providers/timeout.py +169 -0
  69. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/security.py +88 -6
  70. markitai-0.4.1/src/markitai/utils/__init__.py +69 -0
  71. markitai-0.4.1/src/markitai/utils/cli_helpers.py +171 -0
  72. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/utils/executor.py +31 -1
  73. markitai-0.4.1/src/markitai/utils/frontmatter.py +315 -0
  74. markitai-0.4.1/src/markitai/utils/progress.py +92 -0
  75. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/utils/text.py +145 -0
  76. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/workflow/core.py +74 -36
  77. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/workflow/helpers.py +46 -16
  78. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/workflow/single.py +97 -46
  79. {markitai-0.3.0 → markitai-0.4.1}/tests/fixtures/sub_dir/file_example_PPT_250kB.ppt +0 -0
  80. {markitai-0.3.0 → markitai-0.4.1}/tests/fixtures/sub_dir/file_example_XLS_100.xls +0 -0
  81. {markitai-0.3.0 → markitai-0.4.1}/tests/integration/test_cache.py +29 -232
  82. {markitai-0.3.0 → markitai-0.4.1}/tests/integration/test_cli.py +1 -1
  83. markitai-0.4.1/tests/integration/test_cli_full.py +914 -0
  84. markitai-0.4.1/tests/integration/test_local_providers.py +855 -0
  85. {markitai-0.3.0 → markitai-0.4.1}/tests/integration/test_output_format.py +5 -4
  86. markitai-0.4.1/tests/integration/test_real_scenarios.py +379 -0
  87. markitai-0.4.1/tests/unit/test_batch_processor.py +1368 -0
  88. markitai-0.4.1/tests/unit/test_cache_cli.py +491 -0
  89. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_cli_helpers.py +13 -5
  90. markitai-0.4.1/tests/unit/test_cli_main.py +867 -0
  91. markitai-0.4.1/tests/unit/test_config_cli.py +282 -0
  92. markitai-0.4.1/tests/unit/test_converter_pdf.py +889 -0
  93. markitai-0.4.1/tests/unit/test_deps_cli.py +742 -0
  94. markitai-0.4.1/tests/unit/test_doctor_cli.py +366 -0
  95. markitai-0.4.1/tests/unit/test_document_utils.py +1731 -0
  96. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_executor.py +53 -2
  97. markitai-0.4.1/tests/unit/test_fetch.py +3230 -0
  98. markitai-0.4.1/tests/unit/test_fetch_playwright.py +1145 -0
  99. markitai-0.4.1/tests/unit/test_frontmatter.py +422 -0
  100. markitai-0.4.1/tests/unit/test_image.py +2453 -0
  101. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_llm.py +63 -79
  102. markitai-0.4.1/tests/unit/test_llm_content.py +266 -0
  103. markitai-0.4.1/tests/unit/test_llm_models.py +545 -0
  104. markitai-0.4.1/tests/unit/test_llm_processor.py +1337 -0
  105. markitai-0.4.1/tests/unit/test_llm_processor_cli.py +1172 -0
  106. markitai-0.4.1/tests/unit/test_ocr.py +355 -0
  107. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_prompts.py +26 -44
  108. markitai-0.4.1/tests/unit/test_provider_auth.py +512 -0
  109. markitai-0.4.1/tests/unit/test_provider_errors.py +307 -0
  110. markitai-0.4.1/tests/unit/test_provider_json_mode.py +236 -0
  111. markitai-0.4.1/tests/unit/test_provider_timeout.py +302 -0
  112. markitai-0.4.1/tests/unit/test_providers.py +1274 -0
  113. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_schema_sync.py +11 -13
  114. markitai-0.4.1/tests/unit/test_security.py +873 -0
  115. markitai-0.4.1/tests/unit/test_url_processor.py +878 -0
  116. markitai-0.4.1/tests/unit/test_utils_text.py +248 -0
  117. markitai-0.4.1/tests/unit/test_vision_mixin.py +1493 -0
  118. markitai-0.4.1/tests/unit/test_vision_utils.py +54 -0
  119. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_workflow_core.py +783 -0
  120. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_workflow_helpers.py +278 -0
  121. markitai-0.4.1/tests/unit/test_workflow_single.py +711 -0
  122. markitai-0.3.0/PKG-INFO +0 -159
  123. markitai-0.3.0/README.md +0 -120
  124. markitai-0.3.0/src/markitai/__init__.py +0 -3
  125. markitai-0.3.0/src/markitai/cli.py +0 -3979
  126. markitai-0.3.0/src/markitai/fetch.py +0 -1725
  127. markitai-0.3.0/src/markitai/llm.py +0 -4339
  128. markitai-0.3.0/src/markitai/prompts/cleaner.md +0 -93
  129. markitai-0.3.0/src/markitai/prompts/document_enhance_complete.md +0 -65
  130. markitai-0.3.0/src/markitai/prompts/document_process.md +0 -60
  131. markitai-0.3.0/src/markitai/prompts/frontmatter.md +0 -28
  132. markitai-0.3.0/src/markitai/utils/__init__.py +0 -33
  133. markitai-0.3.0/tests/unit/test_fetch.py +0 -360
  134. markitai-0.3.0/tests/unit/test_image.py +0 -568
  135. markitai-0.3.0/tests/unit/test_ocr.py +0 -209
  136. markitai-0.3.0/tests/unit/test_security.py +0 -324
  137. markitai-0.3.0/tests/unit/test_workflow_single.py +0 -353
  138. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/converter/__init__.py +0 -0
  139. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/converter/base.py +0 -0
  140. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/converter/image.py +0 -0
  141. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/converter/legacy.py +0 -0
  142. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/converter/office.py +0 -0
  143. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/converter/text.py +0 -0
  144. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/json_order.py +0 -0
  145. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/types.py +0 -0
  146. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/urls.py +0 -0
  147. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/utils/mime.py +0 -0
  148. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/utils/office.py +0 -0
  149. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/utils/output.py +0 -0
  150. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/utils/paths.py +0 -0
  151. {markitai-0.3.0 → markitai-0.4.1}/src/markitai/workflow/__init__.py +0 -0
  152. {markitai-0.3.0 → markitai-0.4.1}/tests/SKILL.md +0 -0
  153. {markitai-0.3.0 → markitai-0.4.1}/tests/__init__.py +0 -0
  154. {markitai-0.3.0 → markitai-0.4.1}/tests/conftest.py +0 -0
  155. {markitai-0.3.0 → markitai-0.4.1}/tests/fixtures/Free_Test_Data_500KB_PPTX.pptx +0 -0
  156. {markitai-0.3.0 → markitai-0.4.1}/tests/fixtures/candy.JPG +0 -0
  157. {markitai-0.3.0 → markitai-0.4.1}/tests/fixtures/file-example_PDF_500_kB.pdf +0 -0
  158. {markitai-0.3.0 → markitai-0.4.1}/tests/fixtures/file_example_XLSX_100.xlsx +0 -0
  159. {markitai-0.3.0 → markitai-0.4.1}/tests/fixtures/sub_dir/file-sample_100kB.doc +0 -0
  160. {markitai-0.3.0 → markitai-0.4.1}/tests/fixtures/test.urls +0 -0
  161. {markitai-0.3.0 → markitai-0.4.1}/tests/integration/__init__.py +0 -0
  162. {markitai-0.3.0 → markitai-0.4.1}/tests/integration/test_url.py +0 -0
  163. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/__init__.py +0 -0
  164. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_atomic.py +0 -0
  165. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_batch.py +0 -0
  166. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_config.py +0 -0
  167. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_converter.py +0 -0
  168. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_image_converter.py +0 -0
  169. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_json_order.py +0 -0
  170. {markitai-0.3.0 → markitai-0.4.1}/tests/unit/test_llm_runtime.py +0 -0
@@ -38,6 +38,7 @@ ENV/
38
38
  # Testing
39
39
  .pytest_cache/
40
40
  .coverage
41
+ coverage.xml
41
42
  htmlcov/
42
43
  .tox/
43
44
  .nox/
@@ -46,6 +47,9 @@ htmlcov/
46
47
  .mypy_cache/
47
48
  .pytype/
48
49
 
50
+ # Linting
51
+ .ruff_cache/
52
+
49
53
  # Markitai output
50
54
  output/
51
55
  output-*/
@@ -55,6 +59,7 @@ markitai.json
55
59
 
56
60
  # Logs
57
61
  logs/
62
+ logs_*/
58
63
  *.log
59
64
 
60
65
  # Environment variables (API keys)
@@ -66,13 +71,8 @@ logs/
66
71
  .DS_Store
67
72
  Thumbs.db
68
73
 
69
- # SQLite cache (including WAL mode files)
70
- cache.db
71
- cache.db-wal
72
- cache.db-shm
73
- *.db-wal
74
- *.db-shm
75
- fetch_cache.db
74
+ # Markitai cache directory
75
+ .markitai/
76
76
 
77
77
  # VitePress (website)
78
78
  website/node_modules/
@@ -0,0 +1,196 @@
1
+ Metadata-Version: 2.4
2
+ Name: markitai
3
+ Version: 0.4.1
4
+ Summary: Opinionated Markdown converter with native LLM enhancement support
5
+ Project-URL: Homepage, https://markitai.ynewtime.com
6
+ Project-URL: Documentation, https://markitai.ynewtime.com/guide/getting-started
7
+ Project-URL: Repository, https://github.com/Ynewtime/markitai
8
+ Project-URL: Changelog, https://github.com/Ynewtime/markitai/blob/main/CHANGELOG.md
9
+ Author-email: Ynewtime <longqiliuye@gmail.com>
10
+ License-Expression: MIT
11
+ Keywords: converter,docx,llm,markdown,ocr,pdf
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
22
+ Classifier: Topic :: Utilities
23
+ Requires-Python: <3.14,>=3.11
24
+ Requires-Dist: aiofiles>=25.1.0
25
+ Requires-Dist: click>=8.1.0
26
+ Requires-Dist: instructor>=1.14.0
27
+ Requires-Dist: litellm>=1.80.16
28
+ Requires-Dist: loguru>=0.7.3
29
+ Requires-Dist: markitdown[all]>=0.1.4
30
+ Requires-Dist: opencv-python>=4.8.0
31
+ Requires-Dist: pillow>=12.1.0
32
+ Requires-Dist: pydantic>=2.10.0
33
+ Requires-Dist: pymupdf4llm>=0.2.9
34
+ Requires-Dist: python-dotenv>=1.2.1
35
+ Requires-Dist: pywin32>=310; sys_platform == 'win32'
36
+ Requires-Dist: rapidocr>=3.5.0
37
+ Requires-Dist: rich>=14.2.0
38
+ Provides-Extra: all
39
+ Requires-Dist: claude-agent-sdk>=0.1.0; extra == 'all'
40
+ Requires-Dist: github-copilot-sdk>=0.1.0; extra == 'all'
41
+ Requires-Dist: playwright>=1.50.0; extra == 'all'
42
+ Provides-Extra: browser
43
+ Requires-Dist: playwright>=1.50.0; extra == 'browser'
44
+ Provides-Extra: claude-agent
45
+ Requires-Dist: claude-agent-sdk>=0.1.0; extra == 'claude-agent'
46
+ Provides-Extra: copilot
47
+ Requires-Dist: github-copilot-sdk>=0.1.0; extra == 'copilot'
48
+ Description-Content-Type: text/markdown
49
+
50
+ # Markitai
51
+
52
+ English | [简体中文](./README_ZH.md)
53
+
54
+ Opinionated Markdown converter with native LLM enhancement support.
55
+
56
+ ## Features
57
+
58
+ - **Multi-format Support** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
59
+ - **LLM Enhancement** - Format cleaning, metadata generation, image analysis
60
+ - **Batch Processing** - Concurrent conversion, resume capability, progress display
61
+ - **OCR Recognition** - Text extraction from scanned PDFs and images
62
+ - **URL Conversion** - Direct webpage conversion with SPA browser rendering support
63
+ - **Smart Caching** - LLM result caching, SPA domain learning, auto-proxy detection
64
+
65
+ ## Installation
66
+
67
+ ### One-Click Setup (Recommended)
68
+
69
+ ```bash
70
+ # Linux/macOS
71
+ curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.sh | sh
72
+
73
+ # Windows (PowerShell)
74
+ irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
75
+ ```
76
+
77
+ ### Manual Installation
78
+
79
+ ```bash
80
+ # Requires Python 3.11-3.13 (3.14 not yet supported)
81
+ uv tool install markitai
82
+
83
+ # Or using uv pip (for virtual environment)
84
+ uv pip install markitai
85
+ ```
86
+
87
+ ## Quick Start
88
+
89
+ ```bash
90
+ # Basic conversion
91
+ markitai document.docx
92
+
93
+ # URL conversion
94
+ markitai https://example.com/article
95
+
96
+ # LLM enhancement
97
+ markitai document.docx --llm
98
+
99
+ # Using presets
100
+ markitai document.pdf --preset rich # LLM + alt + desc + screenshot
101
+ markitai document.pdf --preset standard # LLM + alt + desc
102
+ markitai document.pdf --preset minimal # Basic conversion only
103
+
104
+ # Batch processing
105
+ markitai ./docs -o ./output
106
+
107
+ # Resume interrupted job
108
+ markitai ./docs -o ./output --resume
109
+
110
+ # Batch URL processing (auto-detect .urls files)
111
+ markitai urls.urls -o ./output
112
+ ```
113
+
114
+ ## Output Structure
115
+
116
+ ```
117
+ output/
118
+ ├── document.docx.md # Basic Markdown
119
+ ├── document.docx.llm.md # LLM-enhanced version
120
+ ├── assets/
121
+ │ ├── document.docx.0001.jpg
122
+ │ └── images.json # Image descriptions
123
+ └── screenshots/ # Page screenshots (with --screenshot)
124
+     └── example_com.full.jpg
125
+ ```
126
+
127
+ ## Configuration
128
+
129
+ Priority: CLI arguments > Environment variables > Config file > Defaults
130
+
131
+ ```bash
132
+ # View configuration
133
+ markitai config list
134
+
135
+ # Initialize config file
136
+ markitai config init -o .
137
+
138
+ # View cache status
139
+ markitai cache stats
140
+
141
+ # Clear cache
142
+ markitai cache clear
143
+
144
+ # Check system health and dependencies
145
+ markitai doctor
146
+ ```
147
+
148
+ Config file location: `./markitai.json` or `~/.markitai/config.json`
149
+
150
+ ### Local Providers (Subscription-based)
151
+
152
+ Use your existing Claude Code or GitHub Copilot subscription:
153
+
154
+ ```bash
155
+ # Claude Agent (requires Claude Code CLI)
156
+ markitai document.pdf --llm # Configure claude-agent/sonnet in config
157
+
158
+ # GitHub Copilot (requires Copilot CLI)
159
+ markitai document.pdf --llm # Configure copilot/gpt-5.2 in config
160
+ ```
161
+
162
+ Install CLI tools:
163
+ ```bash
164
+ # Claude Code CLI
165
+ curl -fsSL https://claude.ai/install.sh | bash
166
+
167
+ # GitHub Copilot CLI
168
+ curl -fsSL https://gh.io/copilot-install | bash
169
+ ```
170
+
171
+ ## Environment Variables
172
+
173
+ | Variable | Description |
174
+ |----------|-------------|
175
+ | `OPENAI_API_KEY` | OpenAI API Key |
176
+ | `GEMINI_API_KEY` | Google Gemini API Key |
177
+ | `DEEPSEEK_API_KEY` | DeepSeek API Key |
178
+ | `ANTHROPIC_API_KEY` | Anthropic API Key |
179
+ | `JINA_API_KEY` | Jina Reader API Key (URL conversion) |
180
+
181
+ ## Dependencies
182
+
183
+ - [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF conversion
184
+ - [markitdown](https://github.com/microsoft/markitdown) - Office documents and URL conversion
185
+ - [LiteLLM](https://github.com/BerriAI/litellm) - LLM gateway
186
+ - [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR recognition
187
+
188
+ ## Documentation
189
+
190
+ - [Getting Started](https://markitai.ynewtime.com/guide/getting-started)
191
+ - [Configuration](https://markitai.ynewtime.com/guide/configuration)
192
+ - [CLI Reference](https://markitai.ynewtime.com/guide/cli)
193
+
194
+ ## License
195
+
196
+ MIT
@@ -0,0 +1,147 @@
1
+ # Markitai
2
+
3
+ English | [简体中文](./README_ZH.md)
4
+
5
+ Opinionated Markdown converter with native LLM enhancement support.
6
+
7
+ ## Features
8
+
9
+ - **Multi-format Support** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
10
+ - **LLM Enhancement** - Format cleaning, metadata generation, image analysis
11
+ - **Batch Processing** - Concurrent conversion, resume capability, progress display
12
+ - **OCR Recognition** - Text extraction from scanned PDFs and images
13
+ - **URL Conversion** - Direct webpage conversion with SPA browser rendering support
14
+ - **Smart Caching** - LLM result caching, SPA domain learning, auto-proxy detection
15
+
16
+ ## Installation
17
+
18
+ ### One-Click Setup (Recommended)
19
+
20
+ ```bash
21
+ # Linux/macOS
22
+ curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.sh | sh
23
+
24
+ # Windows (PowerShell)
25
+ irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
26
+ ```
27
+
28
+ ### Manual Installation
29
+
30
+ ```bash
31
+ # Requires Python 3.11-3.13 (3.14 not yet supported)
32
+ uv tool install markitai
33
+
34
+ # Or using uv pip (for virtual environment)
35
+ uv pip install markitai
36
+ ```
37
+
38
+ ## Quick Start
39
+
40
+ ```bash
41
+ # Basic conversion
42
+ markitai document.docx
43
+
44
+ # URL conversion
45
+ markitai https://example.com/article
46
+
47
+ # LLM enhancement
48
+ markitai document.docx --llm
49
+
50
+ # Using presets
51
+ markitai document.pdf --preset rich # LLM + alt + desc + screenshot
52
+ markitai document.pdf --preset standard # LLM + alt + desc
53
+ markitai document.pdf --preset minimal # Basic conversion only
54
+
55
+ # Batch processing
56
+ markitai ./docs -o ./output
57
+
58
+ # Resume interrupted job
59
+ markitai ./docs -o ./output --resume
60
+
61
+ # Batch URL processing (auto-detect .urls files)
62
+ markitai urls.urls -o ./output
63
+ ```
64
+
65
+ ## Output Structure
66
+
67
+ ```
68
+ output/
69
+ ├── document.docx.md # Basic Markdown
70
+ ├── document.docx.llm.md # LLM-enhanced version
71
+ ├── assets/
72
+ │ ├── document.docx.0001.jpg
73
+ │ └── images.json # Image descriptions
74
+ └── screenshots/ # Page screenshots (with --screenshot)
74
+     └── example_com.full.jpg
76
+ ```
77
+
78
+ ## Configuration
79
+
80
+ Priority: CLI arguments > Environment variables > Config file > Defaults
81
+
82
+ ```bash
83
+ # View configuration
84
+ markitai config list
85
+
86
+ # Initialize config file
87
+ markitai config init -o .
88
+
89
+ # View cache status
90
+ markitai cache stats
91
+
92
+ # Clear cache
93
+ markitai cache clear
94
+
95
+ # Check system health and dependencies
96
+ markitai doctor
97
+ ```
98
+
99
+ Config file location: `./markitai.json` or `~/.markitai/config.json`
100
+
101
+ ### Local Providers (Subscription-based)
102
+
103
+ Use your existing Claude Code or GitHub Copilot subscription:
104
+
105
+ ```bash
106
+ # Claude Agent (requires Claude Code CLI)
107
+ markitai document.pdf --llm # Configure claude-agent/sonnet in config
108
+
109
+ # GitHub Copilot (requires Copilot CLI)
110
+ markitai document.pdf --llm # Configure copilot/gpt-5.2 in config
111
+ ```
112
+
113
+ Install CLI tools:
114
+ ```bash
115
+ # Claude Code CLI
116
+ curl -fsSL https://claude.ai/install.sh | bash
117
+
118
+ # GitHub Copilot CLI
119
+ curl -fsSL https://gh.io/copilot-install | bash
120
+ ```
121
+
122
+ ## Environment Variables
123
+
124
+ | Variable | Description |
125
+ |----------|-------------|
126
+ | `OPENAI_API_KEY` | OpenAI API Key |
127
+ | `GEMINI_API_KEY` | Google Gemini API Key |
128
+ | `DEEPSEEK_API_KEY` | DeepSeek API Key |
129
+ | `ANTHROPIC_API_KEY` | Anthropic API Key |
130
+ | `JINA_API_KEY` | Jina Reader API Key (URL conversion) |
131
+
132
+ ## Dependencies
133
+
134
+ - [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF conversion
135
+ - [markitdown](https://github.com/microsoft/markitdown) - Office documents and URL conversion
136
+ - [LiteLLM](https://github.com/BerriAI/litellm) - LLM gateway
137
+ - [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR recognition
138
+
139
+ ## Documentation
140
+
141
+ - [Getting Started](https://markitai.ynewtime.com/guide/getting-started)
142
+ - [Configuration](https://markitai.ynewtime.com/guide/configuration)
143
+ - [CLI Reference](https://markitai.ynewtime.com/guide/cli)
144
+
145
+ ## License
146
+
147
+ MIT
@@ -1,10 +1,10 @@
1
1
  [project]
2
2
  name = "markitai"
3
- version = "0.3.0"
4
- description = "Document to Markdown converter with LLM enhancement"
3
+ version = "0.4.1"
4
+ description = "Opinionated Markdown converter with native LLM enhancement support"
5
5
  license = "MIT"
6
6
  readme = "README.md"
7
- requires-python = ">=3.11"
7
+ requires-python = ">=3.11,<3.14"
8
8
  authors = [
9
9
  { name = "Ynewtime", email = "longqiliuye@gmail.com" }
10
10
  ]
@@ -32,6 +32,7 @@ dependencies = [
32
32
  "loguru>=0.7.3",
33
33
  "rich>=14.2.0",
34
34
  "Pillow>=12.1.0",
35
+ "opencv-python>=4.8.0",
35
36
  "aiofiles>=25.1.0",
36
37
  "pydantic>=2.10.0",
37
38
  "python-dotenv>=1.2.1",
@@ -48,7 +49,10 @@ Changelog = "https://github.com/Ynewtime/markitai/blob/main/CHANGELOG.md"
48
49
  markitai = "markitai.cli:app"
49
50
 
50
51
  [project.optional-dependencies]
51
- all = []
52
+ claude-agent = ["claude-agent-sdk>=0.1.0"]
53
+ copilot = ["github-copilot-sdk>=0.1.0"]
54
+ browser = ["playwright>=1.50.0"]
55
+ all = ["claude-agent-sdk>=0.1.0", "github-copilot-sdk>=0.1.0", "playwright>=1.50.0"]
52
56
 
53
57
  [dependency-groups]
54
58
  dev = [
@@ -71,9 +75,13 @@ packages = ["src/markitai"]
71
75
  testpaths = ["tests"]
72
76
  asyncio_mode = "auto"
73
77
  asyncio_default_fixture_loop_scope = "function"
78
+ markers = [
79
+ "slow: marks tests as slow (deselect with '-m \"not slow\"')",
80
+ "network: marks tests that require network access (deselect with '-m \"not network\"')",
81
+ ]
74
82
 
75
83
  [tool.ruff]
76
- target-version = "py311"
84
+ target-version = "py313"
77
85
  line-length = 88
78
86
  src = ["src", "tests"]
79
87
 
@@ -114,13 +122,15 @@ skip-magic-trailing-comma = false
114
122
  line-ending = "auto"
115
123
 
116
124
  [tool.pyright]
117
- pythonVersion = "3.11"
125
+ pythonVersion = "3.13"
118
126
  typeCheckingMode = "basic"
119
127
  include = ["src"]
120
128
  exclude = ["tests", "**/__pycache__"]
121
129
  venvPath = "../.."
122
130
  venv = ".venv"
123
- reportMissingImports = true
131
+ # Allow optional dependencies to be missing (claude-agent-sdk)
132
+ # These are runtime-checked before import using importlib.util.find_spec
133
+ reportMissingImports = "warning"
124
134
  reportMissingTypeStubs = false
125
135
  reportUnusedImport = true
126
136
  reportUnusedVariable = "warning"
@@ -0,0 +1,3 @@
1
+ """Markitai - Opinionated Markdown converter with native LLM enhancement support."""
2
+
3
+ __version__ = "0.4.1"
@@ -13,7 +13,7 @@ from pathlib import Path
13
13
  from typing import TYPE_CHECKING, Any
14
14
 
15
15
  from loguru import logger
16
- from rich.console import Console, Group
16
+ from rich.console import Group
17
17
  from rich.live import Live
18
18
  from rich.panel import Panel
19
19
  from rich.progress import (
@@ -28,9 +28,11 @@ from rich.progress import (
28
28
  from rich.table import Table
29
29
  from rich.text import Text
30
30
 
31
+ from markitai.cli.console import get_console
31
32
  from markitai.constants import DEFAULT_LOG_PANEL_MAX_LINES
32
33
  from markitai.json_order import order_report, order_state
33
34
  from markitai.security import atomic_write_json
35
+ from markitai.utils.text import format_error_message
34
36
 
35
37
  if TYPE_CHECKING:
36
38
  from markitai.config import BatchConfig
@@ -464,10 +466,15 @@ class BatchProcessor:
464
466
  self.state_file = self._get_state_file_path()
465
467
  self.report_file = self._get_report_file_path()
466
468
  self.state: BatchState | None = None
467
- self.console = Console()
469
+ self.console = get_console()
468
470
  # Collect image analysis results for JSON aggregation
469
471
  self.image_analysis_results: list[ImageAnalysisResult] = []
470
472
 
473
+ # Optimization: Lock for state saving to prevent IO congestion
474
+ import threading
475
+
476
+ self._save_lock = threading.Lock()
477
+
471
478
  # Live display state (managed by start_live_display/stop_live_display)
472
479
  self._live: Live | None = None
473
480
  self._log_panel: LogPanel | None = None
@@ -515,7 +522,7 @@ class BatchProcessor:
515
522
  "options": key_options,
516
523
  }
517
524
  hash_str = json.dumps(hash_params, sort_keys=True)
518
- return hashlib.md5(hash_str.encode()).hexdigest()[:6]
525
+ return hashlib.md5(hash_str.encode(), usedforsecurity=False).hexdigest()[:6]
519
526
 
520
527
  def _get_state_file_path(self) -> Path:
521
528
  """Generate state file path for resume capability.
@@ -543,11 +550,17 @@ class BatchProcessor:
543
550
  return base_path
544
551
  else: # rename
545
552
  seq = 2
546
- while True:
553
+ max_seq = 9999 # Safety limit to prevent infinite loop
554
+ while seq <= max_seq:
547
555
  new_path = reports_dir / f"markitai.{self.task_hash}.v{seq}.report.json"
548
556
  if not new_path.exists():
549
557
  return new_path
550
558
  seq += 1
559
+ # Fallback: use timestamp if too many versions exist
560
+ import time
561
+
562
+ ts = int(time.time())
563
+ return reports_dir / f"markitai.{self.task_hash}.{ts}.report.json"
551
564
 
552
565
  def start_live_display(
553
566
  self,
@@ -807,6 +820,7 @@ class BatchProcessor:
807
820
  Optimized with interval-based throttling:
808
821
  - Checks interval BEFORE serialization to avoid unnecessary work
809
822
  - Uses minimal serialization when possible
823
+ - Uses thread lock to prevent concurrent disk writes
810
824
 
811
825
  Args:
812
826
  force: Force save even if interval hasn't passed
@@ -816,27 +830,35 @@ class BatchProcessor:
816
830
  return
817
831
 
818
832
  now = datetime.now().astimezone()
819
- interval = getattr(self.config, "state_flush_interval_seconds", 0) or 0
833
+ # Default to 5 seconds when the config value is missing or falsy (0/None) to prevent O(N^2) IO
834
+ interval = getattr(self.config, "state_flush_interval_seconds", 5) or 5
820
835
 
821
836
  # Check interval BEFORE any serialization work (optimization)
822
- if not force and interval > 0:
837
+ if not force:
823
838
  last_saved = getattr(self, "_last_state_save", None)
824
839
  if last_saved and (now - last_saved).total_seconds() < interval:
825
840
  return # Skip: interval not passed, no work done
826
841
 
827
- self.state.updated_at = now.isoformat()
842
+ # Ensure only one thread is writing at a time
843
+ if not self._save_lock.acquire(blocking=force):
844
+ return # Skip if another thread is already saving, unless forced
845
+
846
+ try:
847
+ self.state.updated_at = now.isoformat()
828
848
 
829
- # Build minimal state document (only what's needed for resume)
830
- state_data = self.state.to_minimal_dict()
849
+ # Build minimal state document (only what's needed for resume)
850
+ state_data = self.state.to_minimal_dict()
831
851
 
832
- # Ensure states directory exists
833
- self.state_file.parent.mkdir(parents=True, exist_ok=True)
852
+ # Ensure states directory exists
853
+ self.state_file.parent.mkdir(parents=True, exist_ok=True)
834
854
 
835
- atomic_write_json(self.state_file, state_data, order_func=order_state)
836
- self._last_state_save = now
855
+ atomic_write_json(self.state_file, state_data, order_func=order_state)
856
+ self._last_state_save = now
837
857
 
838
- if log:
839
- logger.info(f"State file saved: {self.state_file.resolve()}")
858
+ if log:
859
+ logger.info(f"State file saved: {self.state_file.resolve()}")
860
+ finally:
861
+ self._save_lock.release()
840
862
 
841
863
  def _compute_summary(self) -> dict[str, Any]:
842
864
  """Compute summary statistics for report."""
@@ -1037,6 +1059,15 @@ class BatchProcessor:
1037
1059
  self.save_state(force=True)
1038
1060
  return self.state
1039
1061
 
1062
+ # Preheat OCR engine if OCR is enabled to eliminate cold start delay
1063
+ if options and options.get("ocr_enabled"):
1064
+ try:
1065
+ from markitai.ocr import OCRProcessor
1066
+
1067
+ OCRProcessor.preheat()
1068
+ except ImportError:
1069
+ logger.debug("OCR preheat skipped: RapidOCR not installed")
1070
+
1040
1071
  # Create semaphore for concurrency control
1041
1072
  semaphore = asyncio.Semaphore(self.config.concurrency)
1042
1073
 
@@ -1126,8 +1157,10 @@ class BatchProcessor:
1126
1157
 
1127
1158
  except Exception as e:
1128
1159
  file_state.status = FileStatus.FAILED
1129
- file_state.error = str(e)
1130
- logger.error(f"Failed to process {file_path.name}: {e}")
1160
+ file_state.error = format_error_message(e)
1161
+ logger.error(
1162
+ f"Failed to process {file_path.name}: {format_error_message(e)}"
1163
+ )
1131
1164
 
1132
1165
  finally:
1133
1166
  end_time = asyncio.get_event_loop().time()
@@ -0,0 +1,52 @@
1
+ """CLI package for Markitai.
2
+
3
+ This package provides the command-line interface for Markitai.
4
+
5
+ Usage:
6
+ from markitai.cli import app
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ # Re-export CLI app
12
+ from markitai.cli.main import app
13
+
14
+ # Re-export validators from processors
15
+ from markitai.cli.processors.validators import (
16
+ warn_case_sensitivity_mismatches as _warn_case_sensitivity_mismatches,
17
+ )
18
+
19
+ # Re-export utilities from refactored modules
20
+ from markitai.utils.cli_helpers import (
21
+ compute_task_hash,
22
+ get_report_file_path,
23
+ is_url,
24
+ sanitize_filename,
25
+ url_to_filename,
26
+ )
27
+ from markitai.utils.output import resolve_output_path
28
+ from markitai.utils.progress import ProgressReporter
29
+
30
+ # Re-export from workflow helpers
31
+ from markitai.workflow.helpers import write_images_json
32
+
33
+ # Re-export types from workflow for backward compatibility
34
+ from markitai.workflow.single import ImageAnalysisResult
35
+
36
+ # Backward compatibility alias (deprecated, use sanitize_filename instead)
37
+ _sanitize_filename = sanitize_filename
38
+
39
+ __all__ = [
40
+ "app",
41
+ "ProgressReporter",
42
+ "is_url",
43
+ "url_to_filename",
44
+ "sanitize_filename",
45
+ "_sanitize_filename", # Deprecated alias
46
+ "_warn_case_sensitivity_mismatches",
47
+ "compute_task_hash",
48
+ "get_report_file_path",
49
+ "resolve_output_path",
50
+ "write_images_json",
51
+ "ImageAnalysisResult",
52
+ ]
@@ -0,0 +1,18 @@
1
+ """CLI commands package.
2
+
3
+ This package contains CLI command groups for Markitai.
4
+
5
+ Available command groups:
6
+ - config: Configuration management commands
7
+ - cache: Cache management commands
8
+ - doctor: System health and dependency checking command
9
+ - check_deps: Alias for doctor (backward compatibility)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from markitai.cli.commands.cache import cache
15
+ from markitai.cli.commands.config import config
16
+ from markitai.cli.commands.doctor import check_deps, doctor
17
+
18
+ __all__ = ["cache", "config", "doctor", "check_deps"]