docmirror 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docmirror-0.2.0/.dockerignore +43 -0
- docmirror-0.2.0/.env.example +9 -0
- docmirror-0.2.0/.github/ISSUE_TEMPLATE/bug_report.yml +69 -0
- docmirror-0.2.0/.github/ISSUE_TEMPLATE/feature_request.yml +33 -0
- docmirror-0.2.0/.github/pull_request_template.md +23 -0
- docmirror-0.2.0/.github/workflows/ci.yml +79 -0
- docmirror-0.2.0/.github/workflows/docs.yml +29 -0
- docmirror-0.2.0/.github/workflows/publish.yml +35 -0
- docmirror-0.2.0/.gitignore +60 -0
- docmirror-0.2.0/.pre-commit-config.yaml +21 -0
- docmirror-0.2.0/AUTHORS.md +13 -0
- docmirror-0.2.0/CHANGELOG.md +25 -0
- docmirror-0.2.0/CODE_OF_CONDUCT.md +54 -0
- docmirror-0.2.0/CONTRIBUTING.md +114 -0
- docmirror-0.2.0/Dockerfile +56 -0
- docmirror-0.2.0/LICENSE +201 -0
- docmirror-0.2.0/Makefile +36 -0
- docmirror-0.2.0/PKG-INFO +202 -0
- docmirror-0.2.0/README.md +113 -0
- docmirror-0.2.0/SECURITY.md +52 -0
- docmirror-0.2.0/docker-compose.yml +34 -0
- docmirror-0.2.0/docmirror/__init__.py +64 -0
- docmirror-0.2.0/docmirror/__main__.py +254 -0
- docmirror-0.2.0/docmirror/adapters/__init__.py +47 -0
- docmirror-0.2.0/docmirror/adapters/data/__init__.py +6 -0
- docmirror-0.2.0/docmirror/adapters/data/structured.py +80 -0
- docmirror-0.2.0/docmirror/adapters/image/__init__.py +6 -0
- docmirror-0.2.0/docmirror/adapters/image/image.py +134 -0
- docmirror-0.2.0/docmirror/adapters/office/__init__.py +6 -0
- docmirror-0.2.0/docmirror/adapters/office/excel.py +113 -0
- docmirror-0.2.0/docmirror/adapters/office/omml_extractor.py +111 -0
- docmirror-0.2.0/docmirror/adapters/office/ppt.py +107 -0
- docmirror-0.2.0/docmirror/adapters/office/word.py +157 -0
- docmirror-0.2.0/docmirror/adapters/pdf/__init__.py +6 -0
- docmirror-0.2.0/docmirror/adapters/pdf/pdf.py +126 -0
- docmirror-0.2.0/docmirror/adapters/web/__init__.py +6 -0
- docmirror-0.2.0/docmirror/adapters/web/email.py +115 -0
- docmirror-0.2.0/docmirror/adapters/web/web.py +113 -0
- docmirror-0.2.0/docmirror/configs/__init__.py +18 -0
- docmirror-0.2.0/docmirror/configs/column_aliases.yaml +178 -0
- docmirror-0.2.0/docmirror/configs/domain_registry.py +206 -0
- docmirror-0.2.0/docmirror/configs/hints.yaml +99 -0
- docmirror-0.2.0/docmirror/configs/institution_registry.yaml +164 -0
- docmirror-0.2.0/docmirror/configs/key_synonyms.yaml +95 -0
- docmirror-0.2.0/docmirror/configs/pipeline_registry.py +108 -0
- docmirror-0.2.0/docmirror/configs/settings.py +229 -0
- docmirror-0.2.0/docmirror/core/__init__.py +14 -0
- docmirror-0.2.0/docmirror/core/exceptions.py +131 -0
- docmirror-0.2.0/docmirror/core/extraction/__init__.py +16 -0
- docmirror-0.2.0/docmirror/core/extraction/entity_collector.py +31 -0
- docmirror-0.2.0/docmirror/core/extraction/extractor.py +2002 -0
- docmirror-0.2.0/docmirror/core/extraction/foundation.py +126 -0
- docmirror-0.2.0/docmirror/core/extraction/html_utils.py +57 -0
- docmirror-0.2.0/docmirror/core/extraction/image_converter.py +48 -0
- docmirror-0.2.0/docmirror/core/extraction/pre_analyzer.py +618 -0
- docmirror-0.2.0/docmirror/core/extraction/quality_router.py +228 -0
- docmirror-0.2.0/docmirror/core/extraction/table_postprocessor.py +97 -0
- docmirror-0.2.0/docmirror/core/factory.py +57 -0
- docmirror-0.2.0/docmirror/core/layout/__init__.py +7 -0
- docmirror-0.2.0/docmirror/core/layout/graph_router.py +421 -0
- docmirror-0.2.0/docmirror/core/layout/layout_analysis.py +1437 -0
- docmirror-0.2.0/docmirror/core/layout/layout_model.py +197 -0
- docmirror-0.2.0/docmirror/core/layout/spatial_graph.py +304 -0
- docmirror-0.2.0/docmirror/core/ocr/__init__.py +8 -0
- docmirror-0.2.0/docmirror/core/ocr/aistudio_provider.py +146 -0
- docmirror-0.2.0/docmirror/core/ocr/fallback.py +1791 -0
- docmirror-0.2.0/docmirror/core/ocr/formula_chars.py +261 -0
- docmirror-0.2.0/docmirror/core/ocr/formula_engine.py +350 -0
- docmirror-0.2.0/docmirror/core/ocr/image_preprocessing.py +369 -0
- docmirror-0.2.0/docmirror/core/ocr/ocr_postprocess.py +367 -0
- docmirror-0.2.0/docmirror/core/ocr/table_reconstruction.py +335 -0
- docmirror-0.2.0/docmirror/core/ocr/vision/__init__.py +7 -0
- docmirror-0.2.0/docmirror/core/ocr/vision/rapidocr_engine.py +340 -0
- docmirror-0.2.0/docmirror/core/ocr/vision/seal_detector.py +252 -0
- docmirror-0.2.0/docmirror/core/security/__init__.py +6 -0
- docmirror-0.2.0/docmirror/core/security/forgery_detector.py +184 -0
- docmirror-0.2.0/docmirror/core/table/__init__.py +8 -0
- docmirror-0.2.0/docmirror/core/table/extraction/__init__.py +60 -0
- docmirror-0.2.0/docmirror/core/table/extraction/char_strategy.py +835 -0
- docmirror-0.2.0/docmirror/core/table/extraction/classifier.py +225 -0
- docmirror-0.2.0/docmirror/core/table/extraction/engine.py +856 -0
- docmirror-0.2.0/docmirror/core/table/extraction/grid_tensor.py +94 -0
- docmirror-0.2.0/docmirror/core/table/extraction/pdfplumber_strategy.py +170 -0
- docmirror-0.2.0/docmirror/core/table/extraction/pipe_strategy.py +234 -0
- docmirror-0.2.0/docmirror/core/table/extraction/rapid_table_engine.py +97 -0
- docmirror-0.2.0/docmirror/core/table/extraction/signal_processor.py +413 -0
- docmirror-0.2.0/docmirror/core/table/extraction/template_injector.py +184 -0
- docmirror-0.2.0/docmirror/core/table/extraction/utils.py +231 -0
- docmirror-0.2.0/docmirror/core/table/merger.py +181 -0
- docmirror-0.2.0/docmirror/core/table/page_state.py +109 -0
- docmirror-0.2.0/docmirror/core/table/postprocess.py +744 -0
- docmirror-0.2.0/docmirror/core/table/table_structure_fix.py +697 -0
- docmirror-0.2.0/docmirror/core/utils/__init__.py +8 -0
- docmirror-0.2.0/docmirror/core/utils/text_utils.py +160 -0
- docmirror-0.2.0/docmirror/core/utils/vocabulary.py +379 -0
- docmirror-0.2.0/docmirror/core/utils/watermark.py +238 -0
- docmirror-0.2.0/docmirror/framework/__init__.py +25 -0
- docmirror-0.2.0/docmirror/framework/base.py +350 -0
- docmirror-0.2.0/docmirror/framework/cache.py +139 -0
- docmirror-0.2.0/docmirror/framework/dispatcher.py +351 -0
- docmirror-0.2.0/docmirror/framework/orchestrator.py +221 -0
- docmirror-0.2.0/docmirror/middlewares/__init__.py +25 -0
- docmirror-0.2.0/docmirror/middlewares/alignment/__init__.py +15 -0
- docmirror-0.2.0/docmirror/middlewares/alignment/amount_splitter.py +179 -0
- docmirror-0.2.0/docmirror/middlewares/alignment/header_alignment.py +209 -0
- docmirror-0.2.0/docmirror/middlewares/base.py +346 -0
- docmirror-0.2.0/docmirror/middlewares/detection/__init__.py +13 -0
- docmirror-0.2.0/docmirror/middlewares/detection/institution_detector.py +169 -0
- docmirror-0.2.0/docmirror/middlewares/detection/language_detector.py +57 -0
- docmirror-0.2.0/docmirror/middlewares/detection/scene_detector.py +308 -0
- docmirror-0.2.0/docmirror/middlewares/extraction/__init__.py +12 -0
- docmirror-0.2.0/docmirror/middlewares/extraction/entity_extractor.py +226 -0
- docmirror-0.2.0/docmirror/middlewares/extraction/generic_entity_extractor.py +44 -0
- docmirror-0.2.0/docmirror/middlewares/validation/__init__.py +12 -0
- docmirror-0.2.0/docmirror/middlewares/validation/mutation_analyzer.py +234 -0
- docmirror-0.2.0/docmirror/middlewares/validation/validator.py +488 -0
- docmirror-0.2.0/docmirror/models/__init__.py +25 -0
- docmirror-0.2.0/docmirror/models/construction/__init__.py +11 -0
- docmirror-0.2.0/docmirror/models/construction/_shared.py +46 -0
- docmirror-0.2.0/docmirror/models/construction/builder.py +341 -0
- docmirror-0.2.0/docmirror/models/entities/__init__.py +18 -0
- docmirror-0.2.0/docmirror/models/entities/document_types.py +126 -0
- docmirror-0.2.0/docmirror/models/entities/domain.py +149 -0
- docmirror-0.2.0/docmirror/models/entities/domain_models.py +214 -0
- docmirror-0.2.0/docmirror/models/entities/enhanced.py +271 -0
- docmirror-0.2.0/docmirror/models/entities/perception_result.py +382 -0
- docmirror-0.2.0/docmirror/models/errors.py +103 -0
- docmirror-0.2.0/docmirror/models/tracking/__init__.py +11 -0
- docmirror-0.2.0/docmirror/models/tracking/mutation.py +93 -0
- docmirror-0.2.0/docmirror/plugins/__init__.py +210 -0
- docmirror-0.2.0/docmirror/plugins/bank_statement.py +104 -0
- docmirror-0.2.0/docmirror/py.typed +0 -0
- docmirror-0.2.0/docmirror/server/__init__.py +6 -0
- docmirror-0.2.0/docmirror/server/api.py +141 -0
- docmirror-0.2.0/docmirror/server/schemas.py +44 -0
- docmirror-0.2.0/docs/api/index.md +24 -0
- docmirror-0.2.0/docs/changelog.md +50 -0
- docmirror-0.2.0/docs/contributing.md +114 -0
- docmirror-0.2.0/docs/design/page-concurrency-implementation.md +138 -0
- docmirror-0.2.0/docs/design/performance-optimization.md +104 -0
- docmirror-0.2.0/docs/design/solution-design.md +378 -0
- docmirror-0.2.0/docs/development/testing.md +35 -0
- docmirror-0.2.0/docs/getting-started/installation.md +55 -0
- docmirror-0.2.0/docs/getting-started/quickstart.md +105 -0
- docmirror-0.2.0/docs/guide/architecture.md +84 -0
- docmirror-0.2.0/docs/guide/configuration.md +95 -0
- docmirror-0.2.0/docs/guide/error-handling.md +38 -0
- docmirror-0.2.0/docs/guide/external-ocr.md +142 -0
- docmirror-0.2.0/docs/guide/formats.md +15 -0
- docmirror-0.2.0/docs/index.md +5 -0
- docmirror-0.2.0/docs/integrations.md +48 -0
- docmirror-0.2.0/docs/plg-strategies.md +365 -0
- docmirror-0.2.0/docs/plugins/creating-plugins.md +70 -0
- docmirror-0.2.0/docs/plugins/overview.md +50 -0
- docmirror-0.2.0/examples/promo_demo.py +143 -0
- docmirror-0.2.0/mkdocs.yml +93 -0
- docmirror-0.2.0/pyproject.toml +118 -0
- docmirror-0.2.0/tests/README.md +35 -0
- docmirror-0.2.0/tests/__init__.py +7 -0
- docmirror-0.2.0/tests/benchmark/__init__.py +7 -0
- docmirror-0.2.0/tests/benchmark/hybrid_matcher.py +222 -0
- docmirror-0.2.0/tests/benchmark/metrics.py +183 -0
- docmirror-0.2.0/tests/conftest.py +33 -0
- docmirror-0.2.0/tests/fixtures/.gitkeep +0 -0
- docmirror-0.2.0/tests/profile_parse.py +131 -0
- docmirror-0.2.0/tests/test_domain_registry.py +140 -0
- docmirror-0.2.0/tests/test_imports.py +102 -0
- docmirror-0.2.0/tests/test_integration.py +337 -0
- docmirror-0.2.0/tests/test_models.py +66 -0
- docmirror-0.2.0/tests/test_omml_extractor.py +50 -0
- docmirror-0.2.0/tests/test_plugins.py +126 -0
- docmirror-0.2.0/tests/test_quality_router.py +180 -0
- docmirror-0.2.0/tests/test_semantic_closure_direct.py +47 -0
- docmirror-0.2.0/tests/test_server.py +45 -0
- docmirror-0.2.0/tests/test_settings.py +54 -0
- docmirror-0.2.0/tests/unit/__init__.py +7 -0
- docmirror-0.2.0/tests/unit/adapters/__init__.py +7 -0
- docmirror-0.2.0/tests/unit/adapters/test_word_doc.py +27 -0
- docmirror-0.2.0/tests/unit/layout/__init__.py +7 -0
- docmirror-0.2.0/tests/unit/layout/test_layout_parallel.py +89 -0
- docmirror-0.2.0/tests/unit/middlewares/__init__.py +7 -0
- docmirror-0.2.0/tests/unit/middlewares/test_validator.py +47 -0
- docmirror-0.2.0/tests/unit/table/__init__.py +7 -0
- docmirror-0.2.0/tests/unit/table/test_classifier.py +52 -0
- docmirror-0.2.0/tests/unit/test_errors.py +46 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Git
|
|
2
|
+
.git
|
|
3
|
+
.gitignore
|
|
4
|
+
|
|
5
|
+
# Python caches
|
|
6
|
+
__pycache__
|
|
7
|
+
*.pyc
|
|
8
|
+
*.pyo
|
|
9
|
+
*.egg-info
|
|
10
|
+
dist
|
|
11
|
+
build
|
|
12
|
+
.eggs
|
|
13
|
+
|
|
14
|
+
# IDE
|
|
15
|
+
.vscode
|
|
16
|
+
.idea
|
|
17
|
+
*.swp
|
|
18
|
+
*.swo
|
|
19
|
+
|
|
20
|
+
# Testing & linting caches
|
|
21
|
+
.pytest_cache
|
|
22
|
+
.ruff_cache
|
|
23
|
+
.mypy_cache
|
|
24
|
+
htmlcov
|
|
25
|
+
|
|
26
|
+
# Output artifacts & model caches
|
|
27
|
+
output/
|
|
28
|
+
model_cache/
|
|
29
|
+
*.onnx
|
|
30
|
+
|
|
31
|
+
# Docker
|
|
32
|
+
Dockerfile
|
|
33
|
+
docker-compose.yml
|
|
34
|
+
|
|
35
|
+
# Documentation
|
|
36
|
+
docs/
|
|
37
|
+
*.md
|
|
38
|
+
!README.md
|
|
39
|
+
|
|
40
|
+
# Test fixtures (large binaries)
|
|
41
|
+
tests/*.pdf
|
|
42
|
+
tests/*.png
|
|
43
|
+
tests/*.jpg
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# DocMirror — 示例环境变量(复制为 .env 后按需修改,勿提交 .env)
|
|
2
|
+
# 使用方式: source .env 或 export $(grep -v '^#' .env | xargs)
|
|
3
|
+
|
|
4
|
+
# ── 外部 OCR(低质量图片/扫描件走 外部OCR模型)──
|
|
5
|
+
# 启用后,当图像质量分 < 阈值 时自动调用外部接口,否则用内置 RapidOCR
|
|
6
|
+
DOCMIRROR_EXTERNAL_OCR_PROVIDER=docmirror.core.ocr.aistudio_provider:call_aistudio_layout_ocr
|
|
7
|
+
DOCMIRROR_EXTERNAL_OCR_QUALITY_THRESHOLD=80
|
|
8
|
+
DOCMIRROR_AISTUDIO_OCR_API_URL=https://XXXX.com/layout-parsing
|
|
9
|
+
DOCMIRROR_AISTUDIO_OCR_TOKEN=your-token-here
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
name: Bug Report
|
|
2
|
+
description: Create a report to help us improve DocMirror parsing and robustness
|
|
3
|
+
title: "[Bug]: "
|
|
4
|
+
labels: ["bug", "triage"]
|
|
5
|
+
body:
|
|
6
|
+
- type: markdown
|
|
7
|
+
attributes:
|
|
8
|
+
value: |
|
|
9
|
+
Thanks for taking the time to fill out this bug report!
|
|
10
|
+
- type: input
|
|
11
|
+
id: version
|
|
12
|
+
attributes:
|
|
13
|
+
label: DocMirror Version
|
|
14
|
+
description: What version of DocMirror are you using? (e.g., 0.1.0 or main branch)
|
|
15
|
+
placeholder: "0.1.0"
|
|
16
|
+
validations:
|
|
17
|
+
required: true
|
|
18
|
+
- type: dropdown
|
|
19
|
+
id: environment
|
|
20
|
+
attributes:
|
|
21
|
+
label: Environment
|
|
22
|
+
description: What environment are you running DocMirror in?
|
|
23
|
+
options:
|
|
24
|
+
- macOS
|
|
25
|
+
- Linux (Ubuntu/Debian)
|
|
26
|
+
- Linux (CentOS/RHEL)
|
|
27
|
+
- Windows
|
|
28
|
+
- Docker
|
|
29
|
+
validations:
|
|
30
|
+
required: true
|
|
31
|
+
- type: textarea
|
|
32
|
+
id: description
|
|
33
|
+
attributes:
|
|
34
|
+
label: Description
|
|
35
|
+
description: A clear and concise description of what the bug is.
|
|
36
|
+
placeholder: "When parsing a PDF with X characteristics, the output completely skips table Y..."
|
|
37
|
+
validations:
|
|
38
|
+
required: true
|
|
39
|
+
- type: textarea
|
|
40
|
+
id: reproduction
|
|
41
|
+
attributes:
|
|
42
|
+
label: Reproduction Steps
|
|
43
|
+
description: How can we reproduce this? Please provide code snippets or CLI commands.
|
|
44
|
+
placeholder: |
|
|
45
|
+
1. Download this sample PDF: [Link]
|
|
46
|
+
2. Run `docmirror parse sample.pdf`
|
|
47
|
+
3. See error
|
|
48
|
+
validations:
|
|
49
|
+
required: true
|
|
50
|
+
- type: textarea
|
|
51
|
+
id: expected
|
|
52
|
+
attributes:
|
|
53
|
+
label: Expected Behavior
|
|
54
|
+
description: What did you expect to happen?
|
|
55
|
+
validations:
|
|
56
|
+
required: true
|
|
57
|
+
- type: textarea
|
|
58
|
+
id: logs
|
|
59
|
+
attributes:
|
|
60
|
+
label: Error Logs / Traceback
|
|
61
|
+
description: Paste any error logs, tracebacks, or terminal output here.
|
|
62
|
+
render: shell
|
|
63
|
+
- type: checkboxes
|
|
64
|
+
id: terms
|
|
65
|
+
attributes:
|
|
66
|
+
label: Code of Conduct
|
|
67
|
+
options:
|
|
68
|
+
- label: I agree to follow this project's Code of Conduct
|
|
69
|
+
required: true
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Feature Request
|
|
2
|
+
description: Suggest an idea for DocMirror (e.g., new formats, new AI models)
|
|
3
|
+
title: "[Feature]: "
|
|
4
|
+
labels: ["enhancement"]
|
|
5
|
+
body:
|
|
6
|
+
- type: markdown
|
|
7
|
+
attributes:
|
|
8
|
+
value: |
|
|
9
|
+
We love new ideas! Please provide as much context as possible.
|
|
10
|
+
- type: textarea
|
|
11
|
+
id: problem
|
|
12
|
+
attributes:
|
|
13
|
+
label: Is your feature request related to a problem?
|
|
14
|
+
description: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
|
|
15
|
+
validations:
|
|
16
|
+
required: true
|
|
17
|
+
- type: textarea
|
|
18
|
+
id: solution
|
|
19
|
+
attributes:
|
|
20
|
+
label: Describe the solution you'd like
|
|
21
|
+
description: A clear and concise description of what you want to happen. Do you have a specific AI model or library in mind?
|
|
22
|
+
validations:
|
|
23
|
+
required: true
|
|
24
|
+
- type: textarea
|
|
25
|
+
id: alternatives
|
|
26
|
+
attributes:
|
|
27
|
+
label: Describe alternatives you've considered
|
|
28
|
+
description: A clear and concise description of any alternative solutions or features you've considered.
|
|
29
|
+
- type: textarea
|
|
30
|
+
id: context
|
|
31
|
+
attributes:
|
|
32
|
+
label: Additional Context
|
|
33
|
+
description: Add any other context or screenshots about the feature request here.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
## Description
|
|
2
|
+
|
|
3
|
+
Brief description of the changes.
|
|
4
|
+
|
|
5
|
+
## Type of Change
|
|
6
|
+
|
|
7
|
+
- [ ] Bug fix (non-breaking change which fixes an issue)
|
|
8
|
+
- [ ] New feature (non-breaking change which adds functionality)
|
|
9
|
+
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
|
|
10
|
+
- [ ] Documentation update
|
|
11
|
+
|
|
12
|
+
## Checklist
|
|
13
|
+
|
|
14
|
+
- [ ] My code follows the project's coding standards
|
|
15
|
+
- [ ] I have added/updated tests for my changes
|
|
16
|
+
- [ ] All new and existing tests pass (`pytest tests/ -v`)
|
|
17
|
+
- [ ] Linting passes (`ruff check docmirror/`)
|
|
18
|
+
- [ ] I have updated the documentation (if applicable)
|
|
19
|
+
- [ ] I have added an entry to `CHANGELOG.md` (if applicable)
|
|
20
|
+
|
|
21
|
+
## Related Issues
|
|
22
|
+
|
|
23
|
+
Closes #
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
lint:
|
|
14
|
+
name: Lint
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
- name: Install ruff
|
|
22
|
+
run: pip install ruff
|
|
23
|
+
- name: Run ruff check
|
|
24
|
+
run: ruff check docmirror/
|
|
25
|
+
- name: Run ruff format check
|
|
26
|
+
run: ruff format --check docmirror/
|
|
27
|
+
|
|
28
|
+
test:
|
|
29
|
+
name: Test (Python ${{ matrix.python-version }}, ${{ matrix.os }})
|
|
30
|
+
runs-on: ${{ matrix.os }}
|
|
31
|
+
strategy:
|
|
32
|
+
fail-fast: false
|
|
33
|
+
matrix:
|
|
34
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
35
|
+
os: [ubuntu-latest]
|
|
36
|
+
include:
|
|
37
|
+
- python-version: "3.12"
|
|
38
|
+
os: macos-latest
|
|
39
|
+
- python-version: "3.12"
|
|
40
|
+
os: windows-latest
|
|
41
|
+
steps:
|
|
42
|
+
- uses: actions/checkout@v4
|
|
43
|
+
- uses: actions/setup-python@v5
|
|
44
|
+
with:
|
|
45
|
+
python-version: ${{ matrix.python-version }}
|
|
46
|
+
- name: Install package with dev dependencies
|
|
47
|
+
run: pip install -e ".[dev]"
|
|
48
|
+
- name: Run tests
|
|
49
|
+
run: pytest tests/ -v --tb=short
|
|
50
|
+
- name: Run tests with coverage
|
|
51
|
+
if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
|
|
52
|
+
run: |
|
|
53
|
+
pip install pytest-cov
|
|
54
|
+
pytest tests/ -v --cov=docmirror --cov-report=xml --cov-report=term-missing
|
|
55
|
+
- name: Upload coverage to Codecov
|
|
56
|
+
if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
|
|
57
|
+
uses: codecov/codecov-action@v4
|
|
58
|
+
with:
|
|
59
|
+
file: coverage.xml
|
|
60
|
+
fail_ci_if_error: false
|
|
61
|
+
env:
|
|
62
|
+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
|
63
|
+
|
|
64
|
+
build:
|
|
65
|
+
name: Build package
|
|
66
|
+
runs-on: ubuntu-latest
|
|
67
|
+
steps:
|
|
68
|
+
- uses: actions/checkout@v4
|
|
69
|
+
- uses: actions/setup-python@v5
|
|
70
|
+
with:
|
|
71
|
+
python-version: "3.12"
|
|
72
|
+
- name: Install build tools
|
|
73
|
+
run: pip install build
|
|
74
|
+
- name: Build sdist and wheel
|
|
75
|
+
run: python -m build
|
|
76
|
+
- name: Verify package
|
|
77
|
+
run: |
|
|
78
|
+
pip install dist/*.whl
|
|
79
|
+
python -c "import docmirror; print(f'✅ docmirror {docmirror.__version__} imports successfully')"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
name: Deploy Docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
paths:
|
|
7
|
+
- "docs/**"
|
|
8
|
+
- "mkdocs.yml"
|
|
9
|
+
- "docmirror/**"
|
|
10
|
+
workflow_dispatch:
|
|
11
|
+
|
|
12
|
+
permissions:
|
|
13
|
+
contents: write
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
deploy:
|
|
17
|
+
name: Deploy to GitHub Pages
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
- uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: "3.12"
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: |
|
|
26
|
+
pip install mkdocs-material mkdocstrings[python] pymdown-extensions
|
|
27
|
+
pip install -e ".[dev]"
|
|
28
|
+
- name: Deploy to GitHub Pages
|
|
29
|
+
run: mkdocs gh-deploy --force
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
publish:
|
|
12
|
+
name: Build and publish to PyPI
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
environment: pypi
|
|
15
|
+
permissions:
|
|
16
|
+
id-token: write # Required for trusted publishing
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
- uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.12"
|
|
22
|
+
|
|
23
|
+
- name: Install build tools
|
|
24
|
+
run: pip install build
|
|
25
|
+
|
|
26
|
+
- name: Build sdist and wheel
|
|
27
|
+
run: python -m build
|
|
28
|
+
|
|
29
|
+
- name: Verify package
|
|
30
|
+
run: |
|
|
31
|
+
pip install dist/*.whl
|
|
32
|
+
python -c "import docmirror; print(f'✅ docmirror installed')"
|
|
33
|
+
|
|
34
|
+
- name: Publish to PyPI
|
|
35
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
## General
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
*.egg-info/
|
|
7
|
+
*.egg
|
|
8
|
+
dist/
|
|
9
|
+
build/
|
|
10
|
+
.eggs/
|
|
11
|
+
|
|
12
|
+
## Environment
|
|
13
|
+
.env
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
|
|
18
|
+
## IDE
|
|
19
|
+
.vscode/
|
|
20
|
+
.idea/
|
|
21
|
+
*.swp
|
|
22
|
+
*.swo
|
|
23
|
+
*~
|
|
24
|
+
|
|
25
|
+
## OS
|
|
26
|
+
.DS_Store
|
|
27
|
+
Thumbs.db
|
|
28
|
+
|
|
29
|
+
## Model weights & large files
|
|
30
|
+
*.onnx
|
|
31
|
+
*.pt
|
|
32
|
+
*.pth
|
|
33
|
+
*.bin
|
|
34
|
+
*.safetensors
|
|
35
|
+
models_cache/
|
|
36
|
+
|
|
37
|
+
## Test artifacts
|
|
38
|
+
*.pdf
|
|
39
|
+
!tests/fixtures/*.pdf
|
|
40
|
+
htmlcov/
|
|
41
|
+
.coverage
|
|
42
|
+
.coverage.*
|
|
43
|
+
coverage.xml
|
|
44
|
+
|
|
45
|
+
## Parse output
|
|
46
|
+
output/
|
|
47
|
+
|
|
48
|
+
## Distribution
|
|
49
|
+
*.whl
|
|
50
|
+
*.tar.gz
|
|
51
|
+
|
|
52
|
+
## Cache
|
|
53
|
+
.mypy_cache/
|
|
54
|
+
|
|
55
|
+
## Test fixtures (real documents)
|
|
56
|
+
tests/fixtures/银行流水/
|
|
57
|
+
tests/fixtures/*.zip
|
|
58
|
+
tests/fixtures/*.jpg
|
|
59
|
+
tests/fixtures/*.png
|
|
60
|
+
tests/golden/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v4.5.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-added-large-files
|
|
9
|
+
|
|
10
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
11
|
+
rev: v0.3.3
|
|
12
|
+
hooks:
|
|
13
|
+
- id: ruff
|
|
14
|
+
args: [ --fix ]
|
|
15
|
+
- id: ruff-format
|
|
16
|
+
|
|
17
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
18
|
+
rev: v1.8.0
|
|
19
|
+
hooks:
|
|
20
|
+
- id: mypy
|
|
21
|
+
additional_dependencies: [pydantic>=2.0]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Authors and Contributors
|
|
2
|
+
|
|
3
|
+
The **DocMirror** project was originally created and is primarily maintained by **ValueMap Global**.
|
|
4
|
+
|
|
5
|
+
We are incredibly grateful to the developers, researchers, and community members who have contributed their time and expertise to make this open-source document parsing engine world-class.
|
|
6
|
+
|
|
7
|
+
## Core Maintainer & Creator
|
|
8
|
+
- **Adam Lin** <adamlin@valuemapglobal.com> - (Lead Architect & Core Engine Developer)
|
|
9
|
+
|
|
10
|
+
## Contributors
|
|
11
|
+
*(Alphabetical order. Please add your name here if you submit a significant PR!)*
|
|
12
|
+
|
|
13
|
+
- **Antigravity** (Google DeepMind) - Extensive structural reform, traceability optimization, and architectural purification in Phase 1-15 (March 2026).
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-03-11
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Initial open-source release of DocMirror
|
|
12
|
+
- 8 format adapters: PDF, Image, Word, Excel, PowerPoint, Email, Web, Structured
|
|
13
|
+
- Core extraction engine with PyMuPDF and pdfplumber backends
|
|
14
|
+
- OCR support via RapidOCR (ONNX Runtime)
|
|
15
|
+
- Layout analysis with DocLayout-YOLO and rule-based fallback
|
|
16
|
+
- Multi-strategy table extraction (character-based, PDFPlumber, RapidTable, VLM)
|
|
17
|
+
- Formula recognition via LaTeX-OCR
|
|
18
|
+
- PDF forgery & tamper detection (ELA + metadata analysis)
|
|
19
|
+
- VLM integration via Ollama HTTP API
|
|
20
|
+
- Middleware pipeline: SceneDetector, EntityExtractor, ColumnMapper, Validator, Repairer
|
|
21
|
+
- Redis-based parse result caching
|
|
22
|
+
- `pyproject.toml` with modular optional dependencies
|
|
23
|
+
- Test suite (28 tests) with pytest
|
|
24
|
+
- GitHub Actions CI/CD (lint, test on Python 3.10-3.13, build)
|
|
25
|
+
- Apache 2.0 license
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
|
2
|
+
|
|
3
|
+
## Our Pledge
|
|
4
|
+
|
|
5
|
+
We as members, contributors, and leaders pledge to make participation in our
|
|
6
|
+
community a harassment-free experience for everyone, regardless of age, body
|
|
7
|
+
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
|
8
|
+
identity and expression, level of experience, education, socio-economic status,
|
|
9
|
+
nationality, personal appearance, race, caste, color, religion, or sexual
|
|
10
|
+
identity and orientation.
|
|
11
|
+
|
|
12
|
+
## Our Standards
|
|
13
|
+
|
|
14
|
+
Examples of behavior that contributes to a positive environment:
|
|
15
|
+
|
|
16
|
+
* Using welcoming and inclusive language
|
|
17
|
+
* Being respectful of differing viewpoints and experiences
|
|
18
|
+
* Gracefully accepting constructive criticism
|
|
19
|
+
* Focusing on what is best for the community
|
|
20
|
+
* Showing empathy towards other community members
|
|
21
|
+
|
|
22
|
+
Examples of unacceptable behavior:
|
|
23
|
+
|
|
24
|
+
* The use of sexualized language or imagery, and sexual attention or advances of any kind
|
|
25
|
+
* Trolling, insulting or derogatory comments, and personal or political attacks
|
|
26
|
+
* Public or private harassment
|
|
27
|
+
* Publishing others' private information without explicit permission
|
|
28
|
+
* Other conduct which could reasonably be considered inappropriate in a professional setting
|
|
29
|
+
|
|
30
|
+
## Enforcement Responsibilities
|
|
31
|
+
|
|
32
|
+
Community leaders are responsible for clarifying and enforcing our standards of
|
|
33
|
+
acceptable behavior and will take appropriate and fair corrective action in
|
|
34
|
+
response to any behavior that they deem inappropriate, threatening, offensive,
|
|
35
|
+
or harmful.
|
|
36
|
+
|
|
37
|
+
## Scope
|
|
38
|
+
|
|
39
|
+
This Code of Conduct applies within all community spaces, and also applies when
|
|
40
|
+
an individual is officially representing the community in public spaces.
|
|
41
|
+
|
|
42
|
+
## Enforcement
|
|
43
|
+
|
|
44
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
|
45
|
+
reported to the community leaders responsible for enforcement at
|
|
46
|
+
**docmirror-conduct@googlegroups.com**.
|
|
47
|
+
|
|
48
|
+
All complaints will be reviewed and investigated promptly and fairly.
|
|
49
|
+
|
|
50
|
+
## Attribution
|
|
51
|
+
|
|
52
|
+
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
|
|
53
|
+
version 2.1, available at
|
|
54
|
+
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# Contributing to DocMirror
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to DocMirror! This guide will help you get started.
|
|
4
|
+
|
|
5
|
+
## Development Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Clone the repository
|
|
9
|
+
git clone https://github.com/valuemapglobal/docmirror.git
|
|
10
|
+
cd docmirror
|
|
11
|
+
|
|
12
|
+
# Create a virtual environment
|
|
13
|
+
python -m venv .venv
|
|
14
|
+
source .venv/bin/activate # Linux/macOS
|
|
15
|
+
# .venv\Scripts\activate # Windows
|
|
16
|
+
|
|
17
|
+
# Install with dev dependencies
|
|
18
|
+
pip install -e ".[dev,all]"
|
|
19
|
+
|
|
20
|
+
# Verify setup
|
|
21
|
+
pytest tests/ -v
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Development Workflow
|
|
25
|
+
|
|
26
|
+
1. **Fork** the repository and create a feature branch:
|
|
27
|
+
```bash
|
|
28
|
+
git checkout -b feat/your-feature-name
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
2. **Make changes** — follow the coding standards below.
|
|
32
|
+
|
|
33
|
+
3. **Run checks** before committing:
|
|
34
|
+
```bash
|
|
35
|
+
ruff check docmirror/ # Lint
|
|
36
|
+
ruff format docmirror/ # Format
|
|
37
|
+
pytest tests/ -v # Test (110 cases)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
4. **Commit** with [Conventional Commits](https://www.conventionalcommits.org/):
|
|
41
|
+
```
|
|
42
|
+
feat: add new PDF table extraction strategy
|
|
43
|
+
fix: correct column alignment for merged cells
|
|
44
|
+
docs: update README with new configuration options
|
|
45
|
+
chore: update dependencies
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
5. **Submit a Pull Request** against `main`.
|
|
49
|
+
|
|
50
|
+
## Coding Standards
|
|
51
|
+
|
|
52
|
+
- **Python 3.10+** — use modern syntax (`match/case`, `X | Y` unions)
|
|
53
|
+
- **Type hints** on all public functions
|
|
54
|
+
- **English** for all comments, docstrings, and variable names
|
|
55
|
+
- **Docstrings** in Google style for public API
|
|
56
|
+
- **Line length** — 120 characters max (enforced by ruff)
|
|
57
|
+
|
|
58
|
+
## Project Structure
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
docmirror/
|
|
62
|
+
├── adapters/ # Format-specific adapters (PDF, Image, Office, ...)
|
|
63
|
+
│ ├── pdf/ # PDF adapter with multi-strategy extraction
|
|
64
|
+
│ ├── image/ # Image adapter with OCR
|
|
65
|
+
│ ├── office/ # Word, Excel, PowerPoint adapters
|
|
66
|
+
│ ├── web/ # HTML and Email adapters
|
|
67
|
+
│ └── data/ # Structured data (JSON, XML, CSV)
|
|
68
|
+
├── configs/ # Settings, pipeline registry, domain registry
|
|
69
|
+
├── core/ # Core engines
|
|
70
|
+
│ ├── extraction/ # Extraction, pre-analysis, quality routing
|
|
71
|
+
│ ├── layout/ # Layout analysis (DocLayout-YOLO, graph router)
|
|
72
|
+
│ ├── ocr/ # RapidOCR, formula recognition, seal detection
|
|
73
|
+
│ ├── table/ # Multi-strategy table extraction
|
|
74
|
+
│ ├── security/ # Forgery detection
|
|
75
|
+
│ └── utils/ # Shared helpers (text utils, vocabulary, watermark)
|
|
76
|
+
├── framework/ # Dispatcher, orchestrator, cache, base classes
|
|
77
|
+
├── middlewares/ # Pipeline middlewares
|
|
78
|
+
│ ├── detection/ # Scene, institution, language detection
|
|
79
|
+
│ ├── extraction/ # Entity extraction
|
|
80
|
+
│ ├── alignment/ # Header alignment, amount splitting
|
|
81
|
+
│ └── validation/ # Trust scoring, mutation analysis
|
|
82
|
+
├── models/ # Data models
|
|
83
|
+
│ ├── entities/ # PerceptionResult, EnhancedResult, domain models
|
|
84
|
+
│ ├── construction/ # Builder pattern for result assembly
|
|
85
|
+
│ └── tracking/ # Mutation tracking
|
|
86
|
+
├── plugins/ # Domain plugins (bank_statement, ...)
|
|
87
|
+
└── server/ # FastAPI server
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Adding a New Adapter
|
|
91
|
+
|
|
92
|
+
1. Create `docmirror/adapters/your_format/your_format.py`
|
|
93
|
+
2. Subclass `BaseParser` from `docmirror.framework.base`
|
|
94
|
+
3. Implement `to_base_result(file_path) -> BaseResult`
|
|
95
|
+
4. Register in the dispatcher's `_get_parser()` method
|
|
96
|
+
5. Add tests in `tests/test_your_format.py`
|
|
97
|
+
|
|
98
|
+
## Adding a New Middleware
|
|
99
|
+
|
|
100
|
+
1. Create `docmirror/middlewares/your_category/your_middleware.py`
|
|
101
|
+
2. Subclass `BaseMiddleware` from `docmirror.middlewares.base`
|
|
102
|
+
3. Implement `process(result: EnhancedResult) -> EnhancedResult`
|
|
103
|
+
4. Register in `docmirror/configs/pipeline_registry.py`
|
|
104
|
+
5. Add tests
|
|
105
|
+
|
|
106
|
+
## Reporting Issues
|
|
107
|
+
|
|
108
|
+
- Use [GitHub Issues](https://github.com/valuemapglobal/docmirror/issues)
|
|
109
|
+
- Include: Python version, OS, DocMirror version, minimal reproduction steps
|
|
110
|
+
- For document parsing issues, include a sample file if possible (redact sensitive data)
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
By contributing, you agree that your contributions will be licensed under the [Apache License 2.0](LICENSE).
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Stage 1: Build & Dependencies
|
|
2
|
+
FROM python:3.10-slim AS builder
|
|
3
|
+
|
|
4
|
+
# Install system build dependencies required for compiling Python packages and OpenCV/ONNX
|
|
5
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
6
|
+
build-essential \
|
|
7
|
+
libgl1 \
|
|
8
|
+
libglib2.0-0 \
|
|
9
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
10
|
+
|
|
11
|
+
WORKDIR /app
|
|
12
|
+
|
|
13
|
+
# Copy dependency files AND source code (required for editable/local wheel builds)
|
|
14
|
+
COPY pyproject.toml README.md ./
|
|
15
|
+
COPY docmirror/ docmirror/
|
|
16
|
+
|
|
17
|
+
# Force pip to build wheels for the heavy dependencies locally (if needed)
|
|
18
|
+
RUN pip install --upgrade pip
|
|
19
|
+
# Build wheels for docmirror AND all of its dependencies into /app/wheels.
# Do not pass --no-deps here: it would build only the docmirror wheel, leaving
# the runtime stage (which has no compiler) to fetch or build every dependency.
RUN pip wheel --no-cache-dir --wheel-dir /app/wheels \
|
|
20
|
+
".[all,server]"
|
|
21
|
+
|
|
22
|
+
# Stage 2: Runtime Environment
|
|
23
|
+
FROM python:3.10-slim
|
|
24
|
+
|
|
25
|
+
# Install runtime shared libraries required by OpenCV (rapidocr) and ONNX Runtime
|
|
26
|
+
# Also install curl for healthcheck
|
|
27
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
28
|
+
libgl1 \
|
|
29
|
+
libglib2.0-0 \
|
|
30
|
+
libgomp1 \
|
|
31
|
+
curl \
|
|
32
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
33
|
+
|
|
34
|
+
WORKDIR /app
|
|
35
|
+
|
|
36
|
+
# Copy the pre-built wheels from the builder stage
|
|
37
|
+
COPY --from=builder /app/wheels /wheels
|
|
38
|
+
|
|
39
|
+
# Copy the actual application source code
|
|
40
|
+
COPY . .
|
|
41
|
+
|
|
42
|
+
# Install the application and all its heavy dependencies (PDF, OCR, Layout, Table, Server)
|
|
43
|
+
RUN pip install --no-cache-dir --find-links=/wheels ".[all,server]"
|
|
44
|
+
|
|
45
|
+
# Expose the port the Uvicorn server listens on (matches --port in CMD)
|
|
46
|
+
EXPOSE 8000
|
|
47
|
+
|
|
48
|
+
# Optional: declare a volume for the model cache (avoids re-downloading rapidocr models)
|
|
49
|
+
VOLUME ["/root/.cache"]
|
|
50
|
+
|
|
51
|
+
# Healthcheck for container orchestration
|
|
52
|
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
|
|
53
|
+
CMD curl -f http://localhost:8000/health || exit 1
|
|
54
|
+
|
|
55
|
+
# Run the Uvicorn ASGI server
|
|
56
|
+
CMD ["uvicorn", "docmirror.server.api:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
|