extract-python 0.1.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python-0.2.1/.dockerignore +6 -0
- extract_python-0.2.1/.github/workflows/publish.yml +45 -0
- extract_python-0.2.1/.github/workflows/tests.yml +79 -0
- extract_python-0.2.1/.python-version +1 -0
- extract_python-0.2.1/Dockerfile +76 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/PKG-INFO +1 -1
- extract_python-0.2.1/benches/__init__.py +5 -0
- extract_python-0.2.1/benches/compare.ipynb +117 -0
- extract_python-0.2.1/benches/compare.py +152 -0
- extract_python-0.2.1/benches/constants.py +4 -0
- extract_python-0.2.1/data/.gitignore +2 -0
- extract_python-0.2.1/docker-compose.yml +107 -0
- extract_python-0.2.1/extract +42 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/extract_python/docling_.py +25 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/extract_python/marker_.py +29 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/extract_python/miner_u.py +12 -1
- {extract_python-0.1.0 → extract_python-0.2.1}/extract_python/objects.py +27 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/extract_python/pipeline.py +5 -1
- {extract_python-0.1.0 → extract_python-0.2.1}/pyproject.toml +21 -10
- extract_python-0.2.1/qa/ruff.toml +58 -0
- extract_python-0.2.1/uv.lock +5493 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/.gitignore +0 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/README.md +0 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/extract_python/__init__.py +0 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/extract_python/constants.py +0 -0
- {extract_python-0.1.0 → extract_python-0.2.1}/extract_python/utils.py +0 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Creates a GitHub release for the pushed tag and publishes the package to PyPI.
name: Publish extract-python

on:
  workflow_dispatch:
  push:
    tags:
      - '*'

env:
  # Quoted so YAML keeps the versions as strings (e.g. 3.10 would load as 3.1).
  PYTHON_VERSION: "3.12"
  # uv version this workflow has been running with (previously hard-coded in
  # the "Install uv" step). NOTE(review): the tests workflow pins 0.11.6 —
  # consider aligning the two.
  ASTRAL_VERSION: "0.10.8"

jobs:
  create-release:
    # github.ref_name is only a tag name for tag refs; skip manual runs that
    # were dispatched from a branch instead of releasing "<branch name>".
    if: github.ref_type == 'tag'
    runs-on: ubuntu-latest
    permissions:
      # `gh release create` needs write access to repository contents.
      contents: write
    steps:
      - uses: actions/checkout@v6
      - name: Create GH release
        run: gh release create "$tag" --generate-notes
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          tag: ${{ github.ref_name }}

  # Job id kept as-is so anything referencing it keeps working.
  publish-extra-python-to-pypi:
    runs-on: ubuntu-latest
    permissions:
      # Listing any permission resets the others to none, so the read access
      # used by checkout is granted explicitly.
      contents: read
      id-token: write
    environment:
      name: pypi
      url: https://pypi.org/p/extract-python
    steps:
      - uses: actions/checkout@v6
      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          version: ${{ env.ASTRAL_VERSION }}
      - name: Install Python ${{ env.PYTHON_VERSION }}
        run: uv python install "$PYTHON_VERSION"
      - name: Build
        run: uv build
      - name: Publish
        run: uv publish

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: false
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Lints the code base and runs the test suite, split by the `miner_u` marker.
name: Tests extract-python

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: astral-sh/ruff-action@v3
        with:
          args: '--version'  # skips test by displaying the version
      - name: Check formatting
        run: ruff format --config qa/ruff.toml --check extract_python tests
      - name: Lint test
        run: ruff check --config qa/ruff.toml extract_python tests

  # Everything except the tests marked `miner_u`; needs tesseract installed.
  tests-no-miner-u:
    runs-on: ubuntu-latest
    env:
      PYTHON_VERSION: "3.12"
      ASTRAL_VERSION: "0.11.6"
    steps:
      - uses: actions/checkout@v6
      - name: Setup Python project
        uses: actions/setup-python@v6
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          version: ${{ env.ASTRAL_VERSION }}
          python-version: ${{ env.PYTHON_VERSION }}
          enable-cache: true
      - name: Install tesseract
        # The last line exports the tessdata directory of the installed English
        # language pack to the following steps.
        run: |
          sudo apt-get update
          sudo apt-get install -y --fix-missing tesseract-ocr \
            tesseract-ocr-eng \
            tesseract-ocr-fra \
            tesseract-ocr-deu \
            tesseract-ocr-spa \
            tesseract-ocr-lat \
            tesseract-ocr-jpn \
            libtesseract-dev \
            libleptonica-dev \
            pkg-config
          echo "TESSDATA_PREFIX=$(sudo dpkg -L tesseract-ocr-eng | grep tessdata$)" >> $GITHUB_ENV
      - name: Run tests
        run: |
          uv run --dev --extra docling --extra marker --frozen pytest -m "not miner_u" -vvv --cache-clear --show-capture=all -r A tests

  # Only the tests marked `miner_u`, run with the `mineru` extra.
  tests-miner-u:
    runs-on: ubuntu-latest
    env:
      PYTHON_VERSION: "3.12"
      ASTRAL_VERSION: "0.11.6"
    steps:
      - uses: actions/checkout@v6
      - name: Setup Python project
        uses: actions/setup-python@v6
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          version: ${{ env.ASTRAL_VERSION }}
          python-version: ${{ env.PYTHON_VERSION }}
          enable-cache: true
      - name: Run tests
        run: |
          uv run --dev --extra mineru --frozen pytest -m "miner_u" -vvv --cache-clear --show-capture=all -r A tests

# A newer run on the same ref cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# syntax=docker/dockerfile:1.14.0
# Base stage: slim Python image with curl and a pinned uv release on PATH.
FROM python:3.10-slim-bullseye AS python-base

ENV HOME=/home/user
WORKDIR $HOME

# The package index fetched here is left in place; later stages run
# `apt-get install` on top of it.
RUN apt-get update && apt-get install -y curl

# The uv installer puts the binary under $HOME/.local/bin, hence the PATH entry.
RUN curl -LsSf https://astral.sh/uv/0.6.7/install.sh | sh
ENV PATH="$HOME/.local/bin:$PATH"

# uv settings: copy files instead of linking them, and compile bytecode at
# install time.
ENV UV_LINK_MODE=copy \
    UV_COMPILE_BYTECODE=1
|
|
12
|
+
|
|
13
|
+
# Shared worker stage: application work dir plus the dbmate binary.
FROM python-base AS worker-base

# Suffix of the dbmate release asset to download; it has no default, so it must
# be passed as a build arg.
ARG dbmate_arch
WORKDIR $HOME/src/app

# -f makes curl fail on an HTTP error (e.g. an unknown arch) instead of saving
# the error page as the binary.
RUN set -e; \
    curl -fsSL -o /usr/local/bin/dbmate https://github.com/amacneil/dbmate/releases/download/v2.19.0/dbmate-linux-${dbmate_arch}; \
    chmod +x /usr/local/bin/dbmate
|
|
19
|
+
|
|
20
|
+
# CPU worker: tesseract + the `cpu` extra, started in the "cpu" worker group.
FROM worker-base AS worker-cpu
# TODO: add more languages here
# Refresh the package index in the same layer as the install so that a cached,
# stale index from the base stage cannot make the install fail.
RUN apt-get update && apt-get install -y tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-fra \
    tesseract-ocr-deu \
    tesseract-ocr-spa \
    tesseract-ocr-lat \
    tesseract-ocr-jpn \
    libtesseract-dev \
    libleptonica-dev \
    pkg-config
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
# We skip opencv since we already depend on opencv-python-headless which is the lib we need to use
# Install deps first to optimize layer cache
# The cache mount target must be an absolute path: `~` is not expanded in
# Dockerfile flags, so `target=~/.cache/uv` never covered uv's cache directory
# ($HOME/.cache/uv, with HOME=/home/user set in the base stage).
RUN --mount=type=cache,target=/home/user/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-sources --no-install-project --no-install-package opencv-python --extra cpu
RUN uv run --no-sync docling-tools models download -o ~/.cache/docling/models
# Then copy code
ADD uv.lock pyproject.toml README.md ./
ADD extract_python ./extract_python/
# Then install service
RUN uv sync -v --frozen --no-editable --no-sources --no-install-package opencv-python --extra cpu

RUN rm -rf ~/.cache/pip $(uv cache dir)

ENTRYPOINT ["uv", "run", "--no-sync", "icij-worker", "workers", "start", "-g", "cpu", "extract_python.app:app"]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# MinerU worker: the `miner-u` extra plus its models, started in the "miner-u"
# worker group.
FROM worker-base AS worker-miner-u
RUN apt-get update && apt-get install -y wget
# We skip opencv since we already depend on opencv-python-headless which is the lib we need to use
# Install deps first to optimize layer cache
# The cache mount target must be an absolute path: `~` is not expanded in
# Dockerfile flags, so `target=~/.cache/uv` never covered uv's cache directory
# ($HOME/.cache/uv, with HOME=/home/user set in the base stage).
RUN --mount=type=cache,target=/home/user/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-sources --no-install-project --no-install-package opencv-python --extra miner-u
# TODO: elegantly handle the version here...
# Download models
RUN wget "https://raw.githubusercontent.com/opendatalab/MinerU/refs/tags/magic_pdf-1.3.1-released/scripts/download_models_hf.py" -O download_models_hf.py \
    && uv run python download_models_hf.py
# Then copy code
ADD uv.lock pyproject.toml README.md ./
ADD extract_python ./extract_python/
# Then install service
RUN uv sync -v --frozen --no-editable --no-sources --no-install-package opencv-python --extra miner-u

RUN rm -rf ~/.cache/pip $(uv cache dir)

ENTRYPOINT ["uv", "run", "--no-sync", "icij-worker", "workers", "start", "-g", "miner-u", "extract_python.app:app"]
|
|
72
|
+
|
|
73
|
+
# HTTP service: the upstream task-service image with this package installed in
# editable mode from ./extract-python.
FROM icij/task-service:icij-worker-0.17.21 AS http-service
# Plain local files and directories only, so COPY behaves exactly like ADD here.
COPY uv.lock pyproject.toml README.md ./extract-python/
COPY extract_python ./extract-python/extract_python/
RUN uv pip install -e ./extract-python
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "code",
|
|
5
|
+
"execution_count": null,
|
|
6
|
+
"id": "312f3d3220d0b3a0",
|
|
7
|
+
"metadata": {},
|
|
8
|
+
"outputs": [],
|
|
9
|
+
"source": [
|
|
10
|
+
"import shutil\n",
|
|
11
|
+
"\n",
|
|
12
|
+
"from extract_python.benches import DATA_PATH, TEST_DATA_PATH\n",
|
|
13
|
+
"from extract_python.benches.compare import (\n",
|
|
14
|
+
" compare,\n",
|
|
15
|
+
")\n",
|
|
16
|
+
"from extract_python.pipelines import DoclingPipeline, MarkerPipeline\n",
|
|
17
|
+
"from extract_python.objects import InputDoc, OutputFormat"
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"cell_type": "code",
|
|
22
|
+
"execution_count": null,
|
|
23
|
+
"id": "0a36e830-3fe8-4d65-9b46-f398300e22d9",
|
|
24
|
+
"metadata": {},
|
|
25
|
+
"outputs": [],
|
|
26
|
+
"source": [
|
|
27
|
+
"pdfs = [TEST_DATA_PATH / \"computer_generated.pdf\", TEST_DATA_PATH / \"scanned.pdf\"]\n",
|
|
28
|
+
"work_dir = DATA_PATH / \"workdir\"\n",
|
|
29
|
+
"comparison_dir = work_dir / \"comparison\"\n",
|
|
30
|
+
"docs = [InputDoc.from_path(pdf_path) for pdf_path in pdfs]"
|
|
31
|
+
]
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"cell_type": "code",
|
|
35
|
+
"execution_count": null,
|
|
36
|
+
"id": "6aba45d151eb7302",
|
|
37
|
+
"metadata": {},
|
|
38
|
+
"outputs": [],
|
|
39
|
+
"source": [
|
|
40
|
+
"from extract_python.pipelines import Pipeline\n",
|
|
41
|
+
"\n",
|
|
42
|
+
"pipelines: list[Pipeline] = [DoclingPipeline(), MarkerPipeline()]"
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"cell_type": "code",
|
|
47
|
+
"execution_count": null,
|
|
48
|
+
"id": "d13e58e4c286134b",
|
|
49
|
+
"metadata": {},
|
|
50
|
+
"outputs": [],
|
|
51
|
+
"source": [
|
|
52
|
+
"from extract_python.objects import Result\n",
|
|
53
|
+
"\n",
|
|
54
|
+
"if work_dir.exists():\n",
|
|
55
|
+
" shutil.rmtree(work_dir)\n",
|
|
56
|
+
"\n",
|
|
57
|
+
"for p in pipelines:\n",
|
|
58
|
+
" pipeline_dir = p.registered_name.lower().replace(\"pipeline\", \"\")\n",
|
|
59
|
+
" output_path = work_dir / pipeline_dir\n",
|
|
60
|
+
" output_path.mkdir(parents=True, exist_ok=True)\n",
|
|
61
|
+
" mds: list[Result] = [\n",
|
|
62
|
+
" r\n",
|
|
63
|
+
" async for r in p.extract_content( # noqa: PLE1142\n",
|
|
64
|
+
" docs, output_format=OutputFormat.MARKDOWN, output_path=output_path\n",
|
|
65
|
+
" )\n",
|
|
66
|
+
" ]\n",
|
|
67
|
+
" for md in mds:\n",
|
|
68
|
+
" pages_path = output_path / md.output.path / \"artifacts\" / \"pages.json\"\n",
|
|
69
|
+
" pages_path.write_text(md.output.pages.model_dump_json())"
|
|
70
|
+
]
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"cell_type": "code",
|
|
74
|
+
"execution_count": null,
|
|
75
|
+
"id": "75a4e7093ac41453",
|
|
76
|
+
"metadata": {},
|
|
77
|
+
"outputs": [],
|
|
78
|
+
"source": [
|
|
79
|
+
"from extract_python.benches.compare import discover_comparison\n",
|
|
80
|
+
"\n",
|
|
81
|
+
"if comparison_dir.exists():\n",
|
|
82
|
+
" shutil.rmtree(comparison_dir)\n",
|
|
83
|
+
"\n",
|
|
84
|
+
"compare(discover_comparison(pdfs, work_dir), root=work_dir, output_path=comparison_dir)"
|
|
85
|
+
]
|
|
86
|
+
},
|
|
87
|
+
{
|
|
88
|
+
"cell_type": "code",
|
|
89
|
+
"execution_count": null,
|
|
90
|
+
"id": "814741bf4798974f",
|
|
91
|
+
"metadata": {},
|
|
92
|
+
"outputs": [],
|
|
93
|
+
"source": []
|
|
94
|
+
}
|
|
95
|
+
],
|
|
96
|
+
"metadata": {
|
|
97
|
+
"kernelspec": {
|
|
98
|
+
"display_name": "Python 3 (ipykernel)",
|
|
99
|
+
"language": "python",
|
|
100
|
+
"name": "python3"
|
|
101
|
+
},
|
|
102
|
+
"language_info": {
|
|
103
|
+
"codemirror_mode": {
|
|
104
|
+
"name": "ipython",
|
|
105
|
+
"version": 3
|
|
106
|
+
},
|
|
107
|
+
"file_extension": ".py",
|
|
108
|
+
"mimetype": "text/x-python",
|
|
109
|
+
"name": "python",
|
|
110
|
+
"nbconvert_exporter": "python",
|
|
111
|
+
"pygments_lexer": "ipython3",
|
|
112
|
+
"version": "3.10.11"
|
|
113
|
+
}
|
|
114
|
+
},
|
|
115
|
+
"nbformat": 4,
|
|
116
|
+
"nbformat_minor": 5
|
|
117
|
+
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
|
|
4
|
+
import markdown2
|
|
5
|
+
import pypdfium2
|
|
6
|
+
from extract_python.pipelines.utils import chdir
|
|
7
|
+
from html2image import Html2Image
|
|
8
|
+
from PIL import Image, ImageDraw
|
|
9
|
+
|
|
10
|
+
from extract_python.objects import BaseModel, OutputFormat, PageIndexes
|
|
11
|
+
|
|
12
|
+
_WHITE_BACKGROUND_CSS = "body {background: white;}"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ComparisonItem(BaseModel):
    """A reference document together with the outputs to compare against it."""

    # Path of the reference document; `compare` joins it onto its `root` and
    # opens it as a PDF.
    ref: Path
    # Paths of the outputs produced for `ref`; `discover_comparison` fills
    # these relative to its `root`.
    compared: list[Path]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def compare(comparisons: list[ComparisonItem], root: Path, output_path: Path) -> None:
|
|
21
|
+
output_path.mkdir(parents=True)
|
|
22
|
+
if not comparisons:
|
|
23
|
+
return
|
|
24
|
+
first_item = comparisons[0]
|
|
25
|
+
if not first_item.compared:
|
|
26
|
+
return
|
|
27
|
+
if (root / first_item.compared[0]).is_dir():
|
|
28
|
+
output_format = OutputFormat.MARKDOWN
|
|
29
|
+
else:
|
|
30
|
+
output_format = OutputFormat[first_item.compared[0].suffix]
|
|
31
|
+
match output_format:
|
|
32
|
+
case OutputFormat.MARKDOWN:
|
|
33
|
+
side_by_side_page_comp_fn = side_by_side_md_page_comp
|
|
34
|
+
case _:
|
|
35
|
+
raise ValueError(f"unsupported output format {output_format}")
|
|
36
|
+
for comparison in comparisons:
|
|
37
|
+
if not comparison.compared:
|
|
38
|
+
continue
|
|
39
|
+
# We flatten everything and will fail if 2 refs have the same file name, even
|
|
40
|
+
# if they have different paths. To be improved, potentially using nested
|
|
41
|
+
# structure or concatenating the path into a single dir name
|
|
42
|
+
comparison_dir = output_path / path_to_compared_name(comparison.ref)
|
|
43
|
+
comparison_dir.mkdir()
|
|
44
|
+
ref_path = root / comparison.ref
|
|
45
|
+
ref_pdf = pypdfium2.PdfDocument(ref_path)
|
|
46
|
+
# TODO: create TIFF or PDF
|
|
47
|
+
pages = _scan_pages(root, comparison)
|
|
48
|
+
for page_i, page_idxs in enumerate(pages):
|
|
49
|
+
pdf_page_im = ref_pdf.get_page(page_i).render().to_pil()
|
|
50
|
+
page_comparisons = []
|
|
51
|
+
for compared in comparison.compared:
|
|
52
|
+
compared_name = compared.parent.name
|
|
53
|
+
page_ix = page_idxs[compared_name]
|
|
54
|
+
page_comp_im = side_by_side_page_comp_fn(
|
|
55
|
+
ref_im=pdf_page_im,
|
|
56
|
+
compared_path=root / compared,
|
|
57
|
+
page_ix=page_ix,
|
|
58
|
+
compared_name=compared_name,
|
|
59
|
+
)
|
|
60
|
+
page_comparisons.append(page_comp_im)
|
|
61
|
+
page_comparison_path = comparison_dir / f"page_{page_i}.tiff"
|
|
62
|
+
page_comparisons[0].save(
|
|
63
|
+
page_comparison_path, save_all=True, append_images=page_comparisons[1:]
|
|
64
|
+
)
|
|
65
|
+
for p in page_comparisons:
|
|
66
|
+
p.close()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def discover_comparison(refs: list[Path], root: Path) -> list[ComparisonItem]:
    """Find, under ``root``, the outputs that belong to each reference document.

    Each sub-directory of ``root`` is scanned; an entry whose name equals
    ``path_to_compared_name(ref)`` is recorded, relative to ``root``, as an
    output for that reference.

    Args:
        refs: reference documents to look up.
        root: directory whose sub-directories are scanned.

    Returns:
        One item per reference, in ``refs`` order; ``compared`` is empty when
        nothing was found for it.
    """
    refs_by_name = {path_to_compared_name(ref): ref for ref in refs}
    found = {ref: [] for ref in refs}
    for sub_dir in root.iterdir():
        if sub_dir.is_dir():
            for candidate in sub_dir.iterdir():
                matched_ref = refs_by_name.get(candidate.name)
                if matched_ref is not None:
                    found[matched_ref].append(candidate.relative_to(root))
    return [
        ComparisonItem(ref=matched_ref, compared=outputs)
        for matched_ref, outputs in found.items()
    ]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def side_by_side_md_page_comp(
    ref_im: Image.Image,
    compared_path: Path,
    page_ix: tuple[int, int],
    compared_name: str,
) -> Image.Image:
    """Build an image with the reference page on the left and the rendered
    markdown of that page on the right.

    Args:
        ref_im: image of the reference page; it is copied, not modified.
        compared_path: directory expected to contain exactly one ``*.md`` file.
        page_ix: ``(start, end)`` slice bounds applied to the markdown text.
        compared_name: label drawn on the reference half of the result.

    Returns:
        A new RGB image twice as wide as ``ref_im``.

    Raises:
        ValueError: if ``compared_path`` does not hold exactly one ``*.md`` file.
    """
    md_files = list(compared_path.glob("*.md"))
    if len(md_files) != 1:
        msg = f"unexpected number of md files ({len(md_files)}) in {compared_path}"
        raise ValueError(msg)
    md_content = md_files[0].read_text()[page_ix[0] : page_ix[1]]
    # change the current dir so that the browser renders images properly
    with chdir(compared_path):
        md_page_im = _render_md(md_content, compared_path, html_size=ref_im.size)
    try:
        # The label is drawn on a copy, so the caller's image stays untouched
        # and only the copy is closed below.
        labelled_ref_im = _add_compared_name(ref_im, compared_name)
        try:
            comparison_im = Image.new(
                "RGB", (labelled_ref_im.width * 2, labelled_ref_im.height)
            )
            comparison_im.paste(labelled_ref_im, (0, 0))
            comparison_im.paste(md_page_im, (labelled_ref_im.width, 0))
        finally:
            labelled_ref_im.close()
    finally:
        md_page_im.close()
    return comparison_im
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def path_to_compared_name(path: Path) -> str:
    """Return ``<stem>_<extension>`` for ``path``, the extension without dots.

    Only the last suffix counts as the extension: ``a.tar.gz`` gives
    ``a.tar_gz``. A path without suffix gives ``<stem>_``.
    """
    extension = path.suffix.replace(".", "")
    return "_".join((path.stem, extension))
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _add_compared_name(ref_im: Image.Image, compared_name: str) -> Image.Image:
    """Return a copy of ``ref_im`` with ``compared_name`` drawn in red at the
    top-left corner; ``ref_im`` itself is left untouched."""
    labelled = ref_im.copy()
    drawer = ImageDraw.Draw(labelled)
    drawer.text((0, 0), compared_name, font_size=24, fill=(255, 0, 0))
    return labelled
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _render_md(
    md_content: str, md_path: Path, html_size: tuple[int, int]
) -> Image.Image:
    """Render markdown to an image by screenshotting its HTML conversion.

    Args:
        md_content: markdown text to render.
        md_path: directory that ``<img src="...">`` targets are resolved
            against; they are rewritten to ``file://`` URLs below it.
        html_size: size handed to the HTML screenshot tool.

    Returns:
        The screenshot, fully loaded in memory.

    Raises:
        RuntimeError: if more than one screenshot file was produced.
    """
    with TemporaryDirectory() as tmpdir:
        # TODO: check that we're handling images correctly,
        #  maybe make md images absolute or something like this
        hti = Html2Image(size=html_size, output_path=tmpdir)
        html = markdown2.markdown(md_content)
        html = html.replace('<img src="', f'<img src="file://{md_path.absolute()}/')
        screen_files = hti.screenshot(html_str=html, css_str=_WHITE_BACKGROUND_CSS)
        if len(screen_files) > 1:
            msg = (
                "unexpected state, found multiple screenshots, "
                "either set a large html_size or find a way to combine them into a"
                " single image"
            )
            raise RuntimeError(msg)
        im_path = screen_files[0]
        # Image.open is lazy and keeps the file open. Copy the pixels and close
        # the file before the temporary directory is removed, so the returned
        # image does not depend on a deleted file.
        with Image.open(im_path) as screenshot:
            return screenshot.copy()
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _scan_pages(
    root: Path, comparison: ComparisonItem
) -> list[dict[str, tuple[int, int]]]:
    """Collect, page by page, the page indexes of every compared output.

    Reads ``artifacts/pages.json`` under each compared path.

    Args:
        root: directory the compared paths are joined onto.
        comparison: item whose ``compared`` outputs are scanned.

    Returns:
        One dict per page, mapping the name of an output's parent directory to
        that output's entry for the page. Pages are zipped across outputs, so
        the list stops at the output with the fewest pages.
    """
    # presumably `.root` is the per-page sequence of (start, end) indexes —
    # verify against PageIndexes
    pages_per_output = [
        PageIndexes.model_validate_json(
            (root / compared / "artifacts" / "pages.json").read_text()
        ).root
        for compared in comparison.compared
    ]
    # Must be a list: it is zipped once per page, and a generator would be
    # exhausted after the first page, leaving an empty dict for all the others.
    compared_names = [p.parent.name for p in comparison.compared]
    return [
        dict(zip(compared_names, page_comp_ixs))
        for page_comp_ixs in zip(*pages_per_output)
    ]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# The obsolete top-level `version` attribute was dropped: Compose ignores it
# and only prints a warning.
# $DBMATE_ARCH (worker build arg) is exported by the `extract` wrapper script
# before it calls `docker compose`.

x-tm-amqp-config-variables: &tm-amqp-config
  TASK_MANAGER__RABBITMQ_HOST: rabbitmq

x-postgres-storage-config: &tm-postgres-storage-config
  TASK_MANAGER__BACKEND: amqp
  # Change this to a FSKeyValueStorageConfig if you don't want to use postgres
  TASK_MANAGER__STORAGE__HOST: postgres
  # Environment values are strings; quoted to keep YAML from typing them.
  TASK_MANAGER__STORAGE__PORT: "5432"
  TASK_MANAGER__STORAGE__PASSWORD: changeme

x-worker-config-variables: &worker-config
  ICIJ_WORKER_TYPE: amqp
  ICIJ_WORKER_RABBITMQ_HOST: rabbitmq
  ICIJ_WORKER_RABBITMQ_PORT: "5672"

x-async-app-variables: &async-app
  EXTRACT_DATA_DIR: /usr/src/data
  EXTRACT_WORK_DIR: /usr/src/data/workdir
  EXTRACT_LOG_LEVEL: DEBUG


services:
  rabbitmq:
    image: rabbitmq:3.12.0-management
    container_name: extract-rabbitmq
    healthcheck:
      test: rabbitmq-diagnostics -q status
      interval: 5s
      timeout: 2s
      retries: 10
      start_period: 5s
    ports:
      - "5672:5672"
      - "15672:15672"

  postgres:
    # NOTE(review): the image tag is not pinned, so the version follows `latest`.
    image: postgres
    container_name: extract-postgres
    environment:
      POSTGRES_PASSWORD: changeme
    healthcheck:
      test: pg_isready
      interval: 2s
      timeout: 2s
      retries: 10
      start_period: 5s
    ports:
      - "5435:5432"

  http-service:
    depends_on:
      rabbitmq:
        condition: service_healthy
      postgres:
        condition: service_healthy
    build:
      context: .
      target: http-service
    container_name: extract-http-service
    environment:
      <<: [ *tm-amqp-config, *tm-postgres-storage-config ]
      PORT: "8000"
      HOST: "0.0.0.0"
      LOG_LEVEL: DEBUG
      TASK_MANAGER__APP_PATH: extract_app
    healthcheck:
      test: curl -f http://localhost:8000/health
      interval: 5s
      timeout: 2s
      retries: 10
      start_period: 5s
    ports:
      - "8000:8000"

  extract-worker-cpu:
    depends_on:
      http-service:
        condition: service_healthy
    build:
      context: .
      args:
        dbmate_arch: $DBMATE_ARCH
      target: worker-cpu
    environment:
      <<: [ *worker-config, *async-app ]
    volumes:
      - type: bind
        source: ./data
        target: /usr/src/data

  extract-worker-miner-u:
    depends_on:
      http-service:
        condition: service_healthy
    build:
      context: .
      args:
        dbmate_arch: $DBMATE_ARCH
      target: worker-miner-u
    environment:
      <<: [ *worker-config, *async-app ]
    volumes:
      - type: bind
        source: ./data
        target: /usr/src/data
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
|
|
3
|
+
# Exports DBMATE_ARCH, the value printed by dbmate_arch, to child processes.
function _export_globals() {
    # Assignment and export stay on separate lines: `export VAR=$(cmd)` would
    # report the exit status of `export` and hide a failing dbmate_arch.
    DBMATE_ARCH="$(dbmate_arch)"
    export DBMATE_ARCH
}
|
|
7
|
+
|
|
8
|
+
# Defines the helper functions used by the rest of the script.
function _helpers() {
    # Prints the dbmate architecture name (amd64, arm64 or 386) matching the
    # host architecture; calls _exit_with_message for any other architecture.
    function dbmate_arch() {
        local host_arch
        # Prefer `arch` when it is available, fall back to `uname -m`.
        if command -v arch >/dev/null 2>&1; then
            host_arch=$(arch)
        else
            host_arch=$(uname -m)
        fi
        local dbmate_arch_
        case "$host_arch" in
            x86_64 | amd64)
                dbmate_arch_="amd64"
                ;;
            aarch64 | arm64)
                dbmate_arch_="arm64"
                ;;
            i386)
                dbmate_arch_="386"
                ;;
            *)
                _exit_with_message "Unsupported architecture $host_arch"
                ;;
        esac
        echo "$dbmate_arch_"
    }

}
|
|
30
|
+
|
|
31
|
+
# Entry point: defines the helpers, exports the globals, then forwards every
# argument to `docker compose`.
function _main() {
    set -e
    # Prints $1 and exits with status $2 (default 1).
    function _exit_with_message() {
        # Written to stderr: this function is reached from inside
        # `$(dbmate_arch)`, where anything printed on stdout is captured into
        # the variable instead of being shown to the user.
        echo "$1" >&2
        exit "${2:-1}"
    }
    _helpers
    _export_globals
    docker compose "$@"
}
|
|
41
|
+
|
|
42
|
+
_main "$@"
|
|
@@ -32,6 +32,7 @@ from .objects import (
|
|
|
32
32
|
PageIndexes,
|
|
33
33
|
Result,
|
|
34
34
|
Status,
|
|
35
|
+
SupportedExt,
|
|
35
36
|
)
|
|
36
37
|
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
37
38
|
from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
|
|
@@ -136,6 +137,30 @@ class DoclingPipelineConfig(PipelineConfig):
|
|
|
136
137
|
for f, opt in self.format_options.items()
|
|
137
138
|
}
|
|
138
139
|
|
|
140
|
+
@classmethod
@cache
def supported_formats(cls) -> set[SupportedExt]:
    """Return the file extensions handled by this pipeline.

    The result is memoized by ``@cache``.
    NOTE(review): if ``cache`` is ``functools.cache``, every call returns the
    same set object, so callers should not mutate it — confirm.
    """
    # Subset of https://docling-project.github.io/docling/usage/supported_formats/
    # NOTE(review): `HTLM` looks like a misspelling of `HTML`; it has to match
    # the member name declared on SupportedExt, so check there before renaming.
    extensions = (
        SupportedExt.ADOC,
        SupportedExt.ASCIIDOC,
        SupportedExt.BMP,
        SupportedExt.CSV,
        SupportedExt.DOCX,
        SupportedExt.HTLM,
        SupportedExt.JPG,
        SupportedExt.MD,
        SupportedExt.PDF,
        SupportedExt.PNG,
        SupportedExt.PPTX,
        SupportedExt.TEX,
        SupportedExt.TIFF,
        SupportedExt.TXT,
        SupportedExt.WEBP,
        SupportedExt.XHTML,
        SupportedExt.XLSX,
    )
    return set(extensions)
|
|
163
|
+
|
|
139
164
|
|
|
140
165
|
DEFAULT_FORMAT_OPTIONS = DoclingPipelineConfig().to_format_options()
|
|
141
166
|
|