docmirror 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. docmirror-0.2.0/.dockerignore +43 -0
  2. docmirror-0.2.0/.env.example +9 -0
  3. docmirror-0.2.0/.github/ISSUE_TEMPLATE/bug_report.yml +69 -0
  4. docmirror-0.2.0/.github/ISSUE_TEMPLATE/feature_request.yml +33 -0
  5. docmirror-0.2.0/.github/pull_request_template.md +23 -0
  6. docmirror-0.2.0/.github/workflows/ci.yml +79 -0
  7. docmirror-0.2.0/.github/workflows/docs.yml +29 -0
  8. docmirror-0.2.0/.github/workflows/publish.yml +35 -0
  9. docmirror-0.2.0/.gitignore +60 -0
  10. docmirror-0.2.0/.pre-commit-config.yaml +21 -0
  11. docmirror-0.2.0/AUTHORS.md +13 -0
  12. docmirror-0.2.0/CHANGELOG.md +25 -0
  13. docmirror-0.2.0/CODE_OF_CONDUCT.md +54 -0
  14. docmirror-0.2.0/CONTRIBUTING.md +114 -0
  15. docmirror-0.2.0/Dockerfile +56 -0
  16. docmirror-0.2.0/LICENSE +201 -0
  17. docmirror-0.2.0/Makefile +36 -0
  18. docmirror-0.2.0/PKG-INFO +202 -0
  19. docmirror-0.2.0/README.md +113 -0
  20. docmirror-0.2.0/SECURITY.md +52 -0
  21. docmirror-0.2.0/docker-compose.yml +34 -0
  22. docmirror-0.2.0/docmirror/__init__.py +64 -0
  23. docmirror-0.2.0/docmirror/__main__.py +254 -0
  24. docmirror-0.2.0/docmirror/adapters/__init__.py +47 -0
  25. docmirror-0.2.0/docmirror/adapters/data/__init__.py +6 -0
  26. docmirror-0.2.0/docmirror/adapters/data/structured.py +80 -0
  27. docmirror-0.2.0/docmirror/adapters/image/__init__.py +6 -0
  28. docmirror-0.2.0/docmirror/adapters/image/image.py +134 -0
  29. docmirror-0.2.0/docmirror/adapters/office/__init__.py +6 -0
  30. docmirror-0.2.0/docmirror/adapters/office/excel.py +113 -0
  31. docmirror-0.2.0/docmirror/adapters/office/omml_extractor.py +111 -0
  32. docmirror-0.2.0/docmirror/adapters/office/ppt.py +107 -0
  33. docmirror-0.2.0/docmirror/adapters/office/word.py +157 -0
  34. docmirror-0.2.0/docmirror/adapters/pdf/__init__.py +6 -0
  35. docmirror-0.2.0/docmirror/adapters/pdf/pdf.py +126 -0
  36. docmirror-0.2.0/docmirror/adapters/web/__init__.py +6 -0
  37. docmirror-0.2.0/docmirror/adapters/web/email.py +115 -0
  38. docmirror-0.2.0/docmirror/adapters/web/web.py +113 -0
  39. docmirror-0.2.0/docmirror/configs/__init__.py +18 -0
  40. docmirror-0.2.0/docmirror/configs/column_aliases.yaml +178 -0
  41. docmirror-0.2.0/docmirror/configs/domain_registry.py +206 -0
  42. docmirror-0.2.0/docmirror/configs/hints.yaml +99 -0
  43. docmirror-0.2.0/docmirror/configs/institution_registry.yaml +164 -0
  44. docmirror-0.2.0/docmirror/configs/key_synonyms.yaml +95 -0
  45. docmirror-0.2.0/docmirror/configs/pipeline_registry.py +108 -0
  46. docmirror-0.2.0/docmirror/configs/settings.py +229 -0
  47. docmirror-0.2.0/docmirror/core/__init__.py +14 -0
  48. docmirror-0.2.0/docmirror/core/exceptions.py +131 -0
  49. docmirror-0.2.0/docmirror/core/extraction/__init__.py +16 -0
  50. docmirror-0.2.0/docmirror/core/extraction/entity_collector.py +31 -0
  51. docmirror-0.2.0/docmirror/core/extraction/extractor.py +2002 -0
  52. docmirror-0.2.0/docmirror/core/extraction/foundation.py +126 -0
  53. docmirror-0.2.0/docmirror/core/extraction/html_utils.py +57 -0
  54. docmirror-0.2.0/docmirror/core/extraction/image_converter.py +48 -0
  55. docmirror-0.2.0/docmirror/core/extraction/pre_analyzer.py +618 -0
  56. docmirror-0.2.0/docmirror/core/extraction/quality_router.py +228 -0
  57. docmirror-0.2.0/docmirror/core/extraction/table_postprocessor.py +97 -0
  58. docmirror-0.2.0/docmirror/core/factory.py +57 -0
  59. docmirror-0.2.0/docmirror/core/layout/__init__.py +7 -0
  60. docmirror-0.2.0/docmirror/core/layout/graph_router.py +421 -0
  61. docmirror-0.2.0/docmirror/core/layout/layout_analysis.py +1437 -0
  62. docmirror-0.2.0/docmirror/core/layout/layout_model.py +197 -0
  63. docmirror-0.2.0/docmirror/core/layout/spatial_graph.py +304 -0
  64. docmirror-0.2.0/docmirror/core/ocr/__init__.py +8 -0
  65. docmirror-0.2.0/docmirror/core/ocr/aistudio_provider.py +146 -0
  66. docmirror-0.2.0/docmirror/core/ocr/fallback.py +1791 -0
  67. docmirror-0.2.0/docmirror/core/ocr/formula_chars.py +261 -0
  68. docmirror-0.2.0/docmirror/core/ocr/formula_engine.py +350 -0
  69. docmirror-0.2.0/docmirror/core/ocr/image_preprocessing.py +369 -0
  70. docmirror-0.2.0/docmirror/core/ocr/ocr_postprocess.py +367 -0
  71. docmirror-0.2.0/docmirror/core/ocr/table_reconstruction.py +335 -0
  72. docmirror-0.2.0/docmirror/core/ocr/vision/__init__.py +7 -0
  73. docmirror-0.2.0/docmirror/core/ocr/vision/rapidocr_engine.py +340 -0
  74. docmirror-0.2.0/docmirror/core/ocr/vision/seal_detector.py +252 -0
  75. docmirror-0.2.0/docmirror/core/security/__init__.py +6 -0
  76. docmirror-0.2.0/docmirror/core/security/forgery_detector.py +184 -0
  77. docmirror-0.2.0/docmirror/core/table/__init__.py +8 -0
  78. docmirror-0.2.0/docmirror/core/table/extraction/__init__.py +60 -0
  79. docmirror-0.2.0/docmirror/core/table/extraction/char_strategy.py +835 -0
  80. docmirror-0.2.0/docmirror/core/table/extraction/classifier.py +225 -0
  81. docmirror-0.2.0/docmirror/core/table/extraction/engine.py +856 -0
  82. docmirror-0.2.0/docmirror/core/table/extraction/grid_tensor.py +94 -0
  83. docmirror-0.2.0/docmirror/core/table/extraction/pdfplumber_strategy.py +170 -0
  84. docmirror-0.2.0/docmirror/core/table/extraction/pipe_strategy.py +234 -0
  85. docmirror-0.2.0/docmirror/core/table/extraction/rapid_table_engine.py +97 -0
  86. docmirror-0.2.0/docmirror/core/table/extraction/signal_processor.py +413 -0
  87. docmirror-0.2.0/docmirror/core/table/extraction/template_injector.py +184 -0
  88. docmirror-0.2.0/docmirror/core/table/extraction/utils.py +231 -0
  89. docmirror-0.2.0/docmirror/core/table/merger.py +181 -0
  90. docmirror-0.2.0/docmirror/core/table/page_state.py +109 -0
  91. docmirror-0.2.0/docmirror/core/table/postprocess.py +744 -0
  92. docmirror-0.2.0/docmirror/core/table/table_structure_fix.py +697 -0
  93. docmirror-0.2.0/docmirror/core/utils/__init__.py +8 -0
  94. docmirror-0.2.0/docmirror/core/utils/text_utils.py +160 -0
  95. docmirror-0.2.0/docmirror/core/utils/vocabulary.py +379 -0
  96. docmirror-0.2.0/docmirror/core/utils/watermark.py +238 -0
  97. docmirror-0.2.0/docmirror/framework/__init__.py +25 -0
  98. docmirror-0.2.0/docmirror/framework/base.py +350 -0
  99. docmirror-0.2.0/docmirror/framework/cache.py +139 -0
  100. docmirror-0.2.0/docmirror/framework/dispatcher.py +351 -0
  101. docmirror-0.2.0/docmirror/framework/orchestrator.py +221 -0
  102. docmirror-0.2.0/docmirror/middlewares/__init__.py +25 -0
  103. docmirror-0.2.0/docmirror/middlewares/alignment/__init__.py +15 -0
  104. docmirror-0.2.0/docmirror/middlewares/alignment/amount_splitter.py +179 -0
  105. docmirror-0.2.0/docmirror/middlewares/alignment/header_alignment.py +209 -0
  106. docmirror-0.2.0/docmirror/middlewares/base.py +346 -0
  107. docmirror-0.2.0/docmirror/middlewares/detection/__init__.py +13 -0
  108. docmirror-0.2.0/docmirror/middlewares/detection/institution_detector.py +169 -0
  109. docmirror-0.2.0/docmirror/middlewares/detection/language_detector.py +57 -0
  110. docmirror-0.2.0/docmirror/middlewares/detection/scene_detector.py +308 -0
  111. docmirror-0.2.0/docmirror/middlewares/extraction/__init__.py +12 -0
  112. docmirror-0.2.0/docmirror/middlewares/extraction/entity_extractor.py +226 -0
  113. docmirror-0.2.0/docmirror/middlewares/extraction/generic_entity_extractor.py +44 -0
  114. docmirror-0.2.0/docmirror/middlewares/validation/__init__.py +12 -0
  115. docmirror-0.2.0/docmirror/middlewares/validation/mutation_analyzer.py +234 -0
  116. docmirror-0.2.0/docmirror/middlewares/validation/validator.py +488 -0
  117. docmirror-0.2.0/docmirror/models/__init__.py +25 -0
  118. docmirror-0.2.0/docmirror/models/construction/__init__.py +11 -0
  119. docmirror-0.2.0/docmirror/models/construction/_shared.py +46 -0
  120. docmirror-0.2.0/docmirror/models/construction/builder.py +341 -0
  121. docmirror-0.2.0/docmirror/models/entities/__init__.py +18 -0
  122. docmirror-0.2.0/docmirror/models/entities/document_types.py +126 -0
  123. docmirror-0.2.0/docmirror/models/entities/domain.py +149 -0
  124. docmirror-0.2.0/docmirror/models/entities/domain_models.py +214 -0
  125. docmirror-0.2.0/docmirror/models/entities/enhanced.py +271 -0
  126. docmirror-0.2.0/docmirror/models/entities/perception_result.py +382 -0
  127. docmirror-0.2.0/docmirror/models/errors.py +103 -0
  128. docmirror-0.2.0/docmirror/models/tracking/__init__.py +11 -0
  129. docmirror-0.2.0/docmirror/models/tracking/mutation.py +93 -0
  130. docmirror-0.2.0/docmirror/plugins/__init__.py +210 -0
  131. docmirror-0.2.0/docmirror/plugins/bank_statement.py +104 -0
  132. docmirror-0.2.0/docmirror/py.typed +0 -0
  133. docmirror-0.2.0/docmirror/server/__init__.py +6 -0
  134. docmirror-0.2.0/docmirror/server/api.py +141 -0
  135. docmirror-0.2.0/docmirror/server/schemas.py +44 -0
  136. docmirror-0.2.0/docs/api/index.md +24 -0
  137. docmirror-0.2.0/docs/changelog.md +50 -0
  138. docmirror-0.2.0/docs/contributing.md +114 -0
  139. docmirror-0.2.0/docs/design/page-concurrency-implementation.md +138 -0
  140. docmirror-0.2.0/docs/design/performance-optimization.md +104 -0
  141. docmirror-0.2.0/docs/design/solution-design.md +378 -0
  142. docmirror-0.2.0/docs/development/testing.md +35 -0
  143. docmirror-0.2.0/docs/getting-started/installation.md +55 -0
  144. docmirror-0.2.0/docs/getting-started/quickstart.md +105 -0
  145. docmirror-0.2.0/docs/guide/architecture.md +84 -0
  146. docmirror-0.2.0/docs/guide/configuration.md +95 -0
  147. docmirror-0.2.0/docs/guide/error-handling.md +38 -0
  148. docmirror-0.2.0/docs/guide/external-ocr.md +142 -0
  149. docmirror-0.2.0/docs/guide/formats.md +15 -0
  150. docmirror-0.2.0/docs/index.md +5 -0
  151. docmirror-0.2.0/docs/integrations.md +48 -0
  152. docmirror-0.2.0/docs/plg-strategies.md +365 -0
  153. docmirror-0.2.0/docs/plugins/creating-plugins.md +70 -0
  154. docmirror-0.2.0/docs/plugins/overview.md +50 -0
  155. docmirror-0.2.0/examples/promo_demo.py +143 -0
  156. docmirror-0.2.0/mkdocs.yml +93 -0
  157. docmirror-0.2.0/pyproject.toml +118 -0
  158. docmirror-0.2.0/tests/README.md +35 -0
  159. docmirror-0.2.0/tests/__init__.py +7 -0
  160. docmirror-0.2.0/tests/benchmark/__init__.py +7 -0
  161. docmirror-0.2.0/tests/benchmark/hybrid_matcher.py +222 -0
  162. docmirror-0.2.0/tests/benchmark/metrics.py +183 -0
  163. docmirror-0.2.0/tests/conftest.py +33 -0
  164. docmirror-0.2.0/tests/fixtures/.gitkeep +0 -0
  165. docmirror-0.2.0/tests/profile_parse.py +131 -0
  166. docmirror-0.2.0/tests/test_domain_registry.py +140 -0
  167. docmirror-0.2.0/tests/test_imports.py +102 -0
  168. docmirror-0.2.0/tests/test_integration.py +337 -0
  169. docmirror-0.2.0/tests/test_models.py +66 -0
  170. docmirror-0.2.0/tests/test_omml_extractor.py +50 -0
  171. docmirror-0.2.0/tests/test_plugins.py +126 -0
  172. docmirror-0.2.0/tests/test_quality_router.py +180 -0
  173. docmirror-0.2.0/tests/test_semantic_closure_direct.py +47 -0
  174. docmirror-0.2.0/tests/test_server.py +45 -0
  175. docmirror-0.2.0/tests/test_settings.py +54 -0
  176. docmirror-0.2.0/tests/unit/__init__.py +7 -0
  177. docmirror-0.2.0/tests/unit/adapters/__init__.py +7 -0
  178. docmirror-0.2.0/tests/unit/adapters/test_word_doc.py +27 -0
  179. docmirror-0.2.0/tests/unit/layout/__init__.py +7 -0
  180. docmirror-0.2.0/tests/unit/layout/test_layout_parallel.py +89 -0
  181. docmirror-0.2.0/tests/unit/middlewares/__init__.py +7 -0
  182. docmirror-0.2.0/tests/unit/middlewares/test_validator.py +47 -0
  183. docmirror-0.2.0/tests/unit/table/__init__.py +7 -0
  184. docmirror-0.2.0/tests/unit/table/test_classifier.py +52 -0
  185. docmirror-0.2.0/tests/unit/test_errors.py +46 -0
@@ -0,0 +1,43 @@
1
+ # Git
2
+ .git
3
+ .gitignore
4
+
5
+ # Python caches
6
+ __pycache__
7
+ *.pyc
8
+ *.pyo
9
+ *.egg-info
10
+ dist
11
+ build
12
+ .eggs
13
+
14
+ # IDE
15
+ .vscode
16
+ .idea
17
+ *.swp
18
+ *.swo
19
+
20
+ # Testing & linting caches
21
+ .pytest_cache
22
+ .ruff_cache
23
+ .mypy_cache
24
+ htmlcov
25
+
26
+ # Output artifacts & model caches
27
+ output/
28
+ model_cache/
29
+ *.onnx
30
+
31
+ # Docker
32
+ Dockerfile
33
+ docker-compose.yml
34
+
35
+ # Documentation
36
+ docs/
37
+ *.md
38
+ !README.md
39
+
40
+ # Test fixtures (large binaries)
41
+ tests/*.pdf
42
+ tests/*.png
43
+ tests/*.jpg
@@ -0,0 +1,9 @@
1
+ # DocMirror — 示例环境变量(复制为 .env 后按需修改,勿提交 .env)
2
+ # 使用方式: set -a; source .env; set +a(仅 source 不会导出变量给子进程)或 export $(grep -v '^#' .env | xargs)
3
+
4
+ # ── 外部 OCR(低质量图片/扫描件走外部 OCR 模型)──
5
+ # 启用后,当图像质量分 < 阈值 时自动调用外部接口,否则用内置 RapidOCR
6
+ DOCMIRROR_EXTERNAL_OCR_PROVIDER=docmirror.core.ocr.aistudio_provider:call_aistudio_layout_ocr
7
+ DOCMIRROR_EXTERNAL_OCR_QUALITY_THRESHOLD=80
8
+ DOCMIRROR_AISTUDIO_OCR_API_URL=https://XXXX.com/layout-parsing
9
+ DOCMIRROR_AISTUDIO_OCR_TOKEN=your-token-here
@@ -0,0 +1,69 @@
1
+ name: Bug Report
2
+ description: Create a report to help us improve DocMirror parsing and robustness
3
+ title: "[Bug]: "
4
+ labels: ["bug", "triage"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: |
9
+ Thanks for taking the time to fill out this bug report!
10
+ - type: input
11
+ id: version
12
+ attributes:
13
+ label: DocMirror Version
14
+ description: What version of DocMirror are you using? (e.g., 0.1.0 or main branch)
15
+ placeholder: "0.1.0"
16
+ validations:
17
+ required: true
18
+ - type: dropdown
19
+ id: environment
20
+ attributes:
21
+ label: Environment
22
+ description: What environment are you running DocMirror in?
23
+ options:
24
+ - macOS
25
+ - Linux (Ubuntu/Debian)
26
+ - Linux (CentOS/RHEL)
27
+ - Windows
28
+ - Docker
29
+ validations:
30
+ required: true
31
+ - type: textarea
32
+ id: description
33
+ attributes:
34
+ label: Description
35
+ description: A clear and concise description of what the bug is.
36
+ placeholder: "When parsing a PDF with X characteristics, the output completely skips table Y..."
37
+ validations:
38
+ required: true
39
+ - type: textarea
40
+ id: reproduction
41
+ attributes:
42
+ label: Reproduction Steps
43
+ description: How can we reproduce this? Please provide code snippets or CLI commands.
44
+ placeholder: |
45
+ 1. Download this sample PDF: [Link]
46
+ 2. Run `docmirror parse sample.pdf`
47
+ 3. See error
48
+ validations:
49
+ required: true
50
+ - type: textarea
51
+ id: expected
52
+ attributes:
53
+ label: Expected Behavior
54
+ description: What did you expect to happen?
55
+ validations:
56
+ required: true
57
+ - type: textarea
58
+ id: logs
59
+ attributes:
60
+ label: Error Logs / Traceback
61
+ description: Paste any error logs, tracebacks, or terminal output here.
62
+ render: shell
63
+ - type: checkboxes
64
+ id: terms
65
+ attributes:
66
+ label: Code of Conduct
67
+ options:
68
+ - label: I agree to follow this project's Code of Conduct
69
+ required: true
@@ -0,0 +1,33 @@
1
+ name: Feature Request
2
+ description: Suggest an idea for DocMirror (e.g., new formats, new AI models)
3
+ title: "[Feature]: "
4
+ labels: ["enhancement"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: |
9
+ We love new ideas! Please provide as much context as possible.
10
+ - type: textarea
11
+ id: problem
12
+ attributes:
13
+ label: Is your feature request related to a problem?
14
+ description: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
15
+ validations:
16
+ required: true
17
+ - type: textarea
18
+ id: solution
19
+ attributes:
20
+ label: Describe the solution you'd like
21
+ description: A clear and concise description of what you want to happen. Do you have a specific AI model or library in mind?
22
+ validations:
23
+ required: true
24
+ - type: textarea
25
+ id: alternatives
26
+ attributes:
27
+ label: Describe alternatives you've considered
28
+ description: A clear and concise description of any alternative solutions or features you've considered.
29
+ - type: textarea
30
+ id: context
31
+ attributes:
32
+ label: Additional Context
33
+ description: Add any other context or screenshots about the feature request here.
@@ -0,0 +1,23 @@
1
+ ## Description
2
+
3
+ Brief description of the changes.
4
+
5
+ ## Type of Change
6
+
7
+ - [ ] Bug fix (non-breaking change which fixes an issue)
8
+ - [ ] New feature (non-breaking change which adds functionality)
9
+ - [ ] Breaking change (fix or feature that would cause existing functionality to change)
10
+ - [ ] Documentation update
11
+
12
+ ## Checklist
13
+
14
+ - [ ] My code follows the project's coding standards
15
+ - [ ] I have added/updated tests for my changes
16
+ - [ ] All new and existing tests pass (`pytest tests/ -v`)
17
+ - [ ] Linting passes (`ruff check docmirror/`)
18
+ - [ ] I have updated the documentation (if applicable)
19
+ - [ ] I have added an entry to `CHANGELOG.md` (if applicable)
20
+
21
+ ## Related Issues
22
+
23
+ Closes #
@@ -0,0 +1,79 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ lint:
14
+ name: Lint
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ - uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.12"
21
+ - name: Install ruff
22
+ run: pip install ruff
23
+ - name: Run ruff check
24
+ run: ruff check docmirror/
25
+ - name: Run ruff format check
26
+ run: ruff format --check docmirror/
27
+
28
+ test:
29
+ name: Test (Python ${{ matrix.python-version }}, ${{ matrix.os }})
30
+ runs-on: ${{ matrix.os }}
31
+ strategy:
32
+ fail-fast: false
33
+ matrix:
34
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
35
+ os: [ubuntu-latest]
36
+ include:
37
+ - python-version: "3.12"
38
+ os: macos-latest
39
+ - python-version: "3.12"
40
+ os: windows-latest
41
+ steps:
42
+ - uses: actions/checkout@v4
43
+ - uses: actions/setup-python@v5
44
+ with:
45
+ python-version: ${{ matrix.python-version }}
46
+ - name: Install package with dev dependencies
47
+ run: pip install -e ".[dev]"
48
+ - name: Run tests
49
+ run: pytest tests/ -v --tb=short
50
+ - name: Run tests with coverage
51
+ if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
52
+ run: |
53
+ pip install pytest-cov
54
+ pytest tests/ -v --cov=docmirror --cov-report=xml --cov-report=term-missing
55
+ - name: Upload coverage to Codecov
56
+ if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
57
+ uses: codecov/codecov-action@v4
58
+ with:
59
+ file: coverage.xml
60
+ fail_ci_if_error: false
61
+ env:
62
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
63
+
64
+ build:
65
+ name: Build package
66
+ runs-on: ubuntu-latest
67
+ steps:
68
+ - uses: actions/checkout@v4
69
+ - uses: actions/setup-python@v5
70
+ with:
71
+ python-version: "3.12"
72
+ - name: Install build tools
73
+ run: pip install build
74
+ - name: Build sdist and wheel
75
+ run: python -m build
76
+ - name: Verify package
77
+ run: |
78
+ pip install dist/*.whl
79
+ python -c "import docmirror; print(f'✅ docmirror {docmirror.__version__} imports successfully')"
@@ -0,0 +1,29 @@
1
+ name: Deploy Docs
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths:
7
+ - "docs/**"
8
+ - "mkdocs.yml"
9
+ - "docmirror/**"
10
+ workflow_dispatch:
11
+
12
+ permissions:
13
+ contents: write
14
+
15
+ jobs:
16
+ deploy:
17
+ name: Deploy to GitHub Pages
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ - uses: actions/setup-python@v5
22
+ with:
23
+ python-version: "3.12"
24
+ - name: Install dependencies
25
+ run: |
26
+ pip install mkdocs-material mkdocstrings[python] pymdown-extensions
27
+ pip install -e ".[dev]"
28
+ - name: Deploy to GitHub Pages
29
+ run: mkdocs gh-deploy --force
@@ -0,0 +1,35 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ publish:
12
+ name: Build and publish to PyPI
13
+ runs-on: ubuntu-latest
14
+ environment: pypi
15
+ permissions:
16
+ id-token: write # Required for trusted publishing
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ - uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.12"
22
+
23
+ - name: Install build tools
24
+ run: pip install build
25
+
26
+ - name: Build sdist and wheel
27
+ run: python -m build
28
+
29
+ - name: Verify package
30
+ run: |
31
+ pip install dist/*.whl
32
+ python -c "import docmirror; print(f'✅ docmirror installed')"
33
+
34
+ - name: Publish to PyPI
35
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,60 @@
1
+ ## General
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ .eggs/
11
+
12
+ ## Environment
13
+ .env
14
+ .venv/
15
+ venv/
16
+ env/
17
+
18
+ ## IDE
19
+ .vscode/
20
+ .idea/
21
+ *.swp
22
+ *.swo
23
+ *~
24
+
25
+ ## OS
26
+ .DS_Store
27
+ Thumbs.db
28
+
29
+ ## Model weights & large files
30
+ *.onnx
31
+ *.pt
32
+ *.pth
33
+ *.bin
34
+ *.safetensors
35
+ models_cache/
36
+
37
+ ## Test artifacts
38
+ *.pdf
39
+ !tests/fixtures/*.pdf
40
+ htmlcov/
41
+ .coverage
42
+ .coverage.*
43
+ coverage.xml
44
+
45
+ ## Parse output
46
+ output/
47
+
48
+ ## Distribution
49
+ *.whl
50
+ *.tar.gz
51
+
52
+ ## Cache
53
+ .mypy_cache/
54
+
55
+ ## Test fixtures (real documents)
56
+ tests/fixtures/银行流水/
57
+ tests/fixtures/*.zip
58
+ tests/fixtures/*.jpg
59
+ tests/fixtures/*.png
60
+ tests/golden/
@@ -0,0 +1,21 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.5.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-added-large-files
9
+
10
+ - repo: https://github.com/astral-sh/ruff-pre-commit
11
+ rev: v0.3.3
12
+ hooks:
13
+ - id: ruff
14
+ args: [ --fix ]
15
+ - id: ruff-format
16
+
17
+ - repo: https://github.com/pre-commit/mirrors-mypy
18
+ rev: v1.8.0
19
+ hooks:
20
+ - id: mypy
21
+ additional_dependencies: [pydantic>=2.0]
@@ -0,0 +1,13 @@
1
+ # Authors and Contributors
2
+
3
+ The **DocMirror** project was originally created and is primarily maintained by **ValueMap Global**.
4
+
5
+ We are incredibly grateful to the developers, researchers, and community members who have contributed their time and expertise to make this open-source document parsing engine world-class.
6
+
7
+ ## Core Maintainer & Creator
8
+ - **Adam Lin** <adamlin@valuemapglobal.com> - (Lead Architect & Core Engine Developer)
9
+
10
+ ## Contributors
11
+ *(Alphabetical order. Please add your name here if you submit a significant PR!)*
12
+
13
+ - **Antigravity** (Google DeepMind) - Extensive structural reform, traceability optimization, and architectural purification in Phase 1-15 (March 2026).
@@ -0,0 +1,25 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2026-03-11
9
+
10
+ ### Added
11
+ - Initial open-source release of DocMirror
12
+ - 8 format adapters: PDF, Image, Word, Excel, PowerPoint, Email, Web, Structured
13
+ - Core extraction engine with PyMuPDF and pdfplumber backends
14
+ - OCR support via RapidOCR (ONNX Runtime)
15
+ - Layout analysis with DocLayout-YOLO and rule-based fallback
16
+ - Multi-strategy table extraction (character-based, PDFPlumber, RapidTable, VLM)
17
+ - Formula recognition via LaTeX-OCR
18
+ - PDF forgery & tamper detection (ELA + metadata analysis)
19
+ - VLM integration via Ollama HTTP API
20
+ - Middleware pipeline: SceneDetector, EntityExtractor, ColumnMapper, Validator, Repairer
21
+ - Redis-based parse result caching
22
+ - `pyproject.toml` with modular optional dependencies
23
+ - Test suite (28 tests) with pytest
24
+ - GitHub Actions CI/CD (lint, test on Python 3.10-3.13, build)
25
+ - Apache 2.0 license
@@ -0,0 +1,54 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, caste, color, religion, or sexual
10
+ identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to a positive environment:
15
+
16
+ * Using welcoming and inclusive language
17
+ * Being respectful of differing viewpoints and experiences
18
+ * Gracefully accepting constructive criticism
19
+ * Focusing on what is best for the community
20
+ * Showing empathy towards other community members
21
+
22
+ Examples of unacceptable behavior:
23
+
24
+ * The use of sexualized language or imagery, and sexual attention or advances of any kind
25
+ * Trolling, insulting or derogatory comments, and personal or political attacks
26
+ * Public or private harassment
27
+ * Publishing others' private information without explicit permission
28
+ * Other conduct which could reasonably be considered inappropriate in a professional setting
29
+
30
+ ## Enforcement Responsibilities
31
+
32
+ Community leaders are responsible for clarifying and enforcing our standards of
33
+ acceptable behavior and will take appropriate and fair corrective action in
34
+ response to any behavior that they deem inappropriate, threatening, offensive,
35
+ or harmful.
36
+
37
+ ## Scope
38
+
39
+ This Code of Conduct applies within all community spaces, and also applies when
40
+ an individual is officially representing the community in public spaces.
41
+
42
+ ## Enforcement
43
+
44
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
45
+ reported to the community leaders responsible for enforcement at
46
+ **docmirror-conduct@googlegroups.com**.
47
+
48
+ All complaints will be reviewed and investigated promptly and fairly.
49
+
50
+ ## Attribution
51
+
52
+ This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
53
+ version 2.1, available at
54
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
@@ -0,0 +1,114 @@
1
+ # Contributing to DocMirror
2
+
3
+ Thank you for your interest in contributing to DocMirror! This guide will help you get started.
4
+
5
+ ## Development Setup
6
+
7
+ ```bash
8
+ # Clone the repository
9
+ git clone https://github.com/valuemapglobal/docmirror.git
10
+ cd docmirror
11
+
12
+ # Create a virtual environment
13
+ python -m venv .venv
14
+ source .venv/bin/activate # Linux/macOS
15
+ # .venv\Scripts\activate # Windows
16
+
17
+ # Install with dev dependencies
18
+ pip install -e ".[dev,all]"
19
+
20
+ # Verify setup
21
+ pytest tests/ -v
22
+ ```
23
+
24
+ ## Development Workflow
25
+
26
+ 1. **Fork** the repository and create a feature branch:
27
+ ```bash
28
+ git checkout -b feat/your-feature-name
29
+ ```
30
+
31
+ 2. **Make changes** — follow the coding standards below.
32
+
33
+ 3. **Run checks** before committing:
34
+ ```bash
35
+ ruff check docmirror/ # Lint
36
+ ruff format docmirror/ # Format
37
+ pytest tests/ -v # Test (110 cases)
38
+ ```
39
+
40
+ 4. **Commit** with [Conventional Commits](https://www.conventionalcommits.org/):
41
+ ```
42
+ feat: add new PDF table extraction strategy
43
+ fix: correct column alignment for merged cells
44
+ docs: update README with new configuration options
45
+ chore: update dependencies
46
+ ```
47
+
48
+ 5. **Submit a Pull Request** against `main`.
49
+
50
+ ## Coding Standards
51
+
52
+ - **Python 3.10+** — use modern syntax (`match/case`, `X | Y` unions)
53
+ - **Type hints** on all public functions
54
+ - **English** for all comments, docstrings, and variable names
55
+ - **Docstrings** in Google style for public API
56
+ - **Line length** — 120 characters max (enforced by ruff)
57
+
58
+ ## Project Structure
59
+
60
+ ```
61
+ docmirror/
62
+ ├── adapters/ # Format-specific adapters (PDF, Image, Office, ...)
63
+ │ ├── pdf/ # PDF adapter with multi-strategy extraction
64
+ │ ├── image/ # Image adapter with OCR
65
+ │ ├── office/ # Word, Excel, PowerPoint adapters
66
+ │ ├── web/ # HTML and Email adapters
67
+ │ └── data/ # Structured data (JSON, XML, CSV)
68
+ ├── configs/ # Settings, pipeline registry, domain registry
69
+ ├── core/ # Core engines
70
+ │ ├── extraction/ # Extraction, pre-analysis, quality routing
71
+ │ ├── layout/ # Layout analysis (DocLayout-YOLO, graph router)
72
+ │ ├── ocr/ # RapidOCR, formula recognition, seal detection
73
+ │ ├── table/ # Multi-strategy table extraction
74
+ │ ├── security/ # Forgery detection
75
+ │ └── utils/ # Text utilities, vocabulary, watermark handling
76
+ ├── framework/ # Dispatcher, orchestrator, cache, base classes
77
+ ├── middlewares/ # Pipeline middlewares
78
+ │ ├── detection/ # Scene, institution, language detection
79
+ │ ├── extraction/ # Entity extraction
80
+ │ ├── alignment/ # Header alignment, amount splitting
81
+ │ └── validation/ # Trust scoring, mutation analysis
82
+ ├── models/ # Data models
83
+ │ ├── entities/ # PerceptionResult, EnhancedResult, domain models
84
+ │ ├── construction/ # Builder pattern for result assembly
85
+ │ └── tracking/ # Mutation tracking
86
+ ├── plugins/ # Domain plugins (bank_statement, ...)
87
+ └── server/ # FastAPI server
88
+ ```
89
+
90
+ ## Adding a New Adapter
91
+
92
+ 1. Create `docmirror/adapters/your_format/your_format.py`
93
+ 2. Subclass `BaseParser` from `docmirror.framework.base`
94
+ 3. Implement `to_base_result(file_path) -> BaseResult`
95
+ 4. Register in the dispatcher's `_get_parser()` method
96
+ 5. Add tests in `tests/test_your_format.py`
97
+
98
+ ## Adding a New Middleware
99
+
100
+ 1. Create `docmirror/middlewares/your_category/your_middleware.py`
101
+ 2. Subclass `BaseMiddleware` from `docmirror.middlewares.base`
102
+ 3. Implement `process(result: EnhancedResult) -> EnhancedResult`
103
+ 4. Register in `docmirror/configs/pipeline_registry.py`
104
+ 5. Add tests
105
+
106
+ ## Reporting Issues
107
+
108
+ - Use [GitHub Issues](https://github.com/valuemapglobal/docmirror/issues)
109
+ - Include: Python version, OS, DocMirror version, minimal reproduction steps
110
+ - For document parsing issues, include a sample file if possible (redact sensitive data)
111
+
112
+ ## License
113
+
114
+ By contributing, you agree that your contributions will be licensed under the [Apache License 2.0](LICENSE).
@@ -0,0 +1,56 @@
1
+ # Stage 1: Build & Dependencies
2
+ FROM python:3.10-slim AS builder
3
+
4
+ # Install system build dependencies required for compiling Python packages and OpenCV/ONNX
5
+ RUN apt-get update && apt-get install -y --no-install-recommends \
6
+ build-essential \
7
+ libgl1 \
8
+ libglib2.0-0 \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ WORKDIR /app
12
+
13
+ # Copy dependency files AND source code (required for editable/local wheel builds)
14
+ COPY pyproject.toml README.md ./
15
+ COPY docmirror/ docmirror/
16
+
17
+ # Build a wheel for docmirror itself; because of --no-deps, dependency wheels are NOT built here and are resolved by pip in the runtime stage
18
+ RUN pip install --upgrade pip
19
+ RUN pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels \
20
+ ".[all,server]"
21
+
22
+ # Stage 2: Runtime Environment
23
+ FROM python:3.10-slim
24
+
25
+ # Install runtime C++ libraries required by OpenCV (rapidocr) and ONNX Runtime
26
+ # Also install curl for healthcheck
27
+ RUN apt-get update && apt-get install -y --no-install-recommends \
28
+ libgl1 \
29
+ libglib2.0-0 \
30
+ libgomp1 \
31
+ curl \
32
+ && rm -rf /var/lib/apt/lists/*
33
+
34
+ WORKDIR /app
35
+
36
+ # Copy the pre-built wheels from the builder stage
37
+ COPY --from=builder /app/wheels /wheels
38
+
39
+ # Copy the actual application source code
40
+ COPY . .
41
+
42
+ # Install the application and all its heavy dependencies (PDF, OCR, Layout, Table, Server)
43
+ RUN pip install --no-cache-dir --find-links=/wheels ".[all,server]"
44
+
45
+ # Expose the FastAPI default port
46
+ EXPOSE 8000
47
+
48
+ # Optional: declare a volume for the model cache so RapidOCR models are not re-downloaded on every container start
49
+ VOLUME ["/root/.cache"]
50
+
51
+ # Healthcheck for container orchestration
52
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
53
+ CMD curl -f http://localhost:8000/health || exit 1
54
+
55
+ # Run the Uvicorn ASGI server
56
+ CMD ["uvicorn", "docmirror.server.api:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]