extract-python 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. extract_python-0.3.0/.dockerignore +6 -0
  2. extract_python-0.3.0/.github/workflows/publish.yml +45 -0
  3. extract_python-0.3.0/.github/workflows/tests.yml +79 -0
  4. extract_python-0.3.0/.python-version +1 -0
  5. extract_python-0.3.0/Dockerfile +76 -0
  6. {extract_python-0.1.0 → extract_python-0.3.0}/PKG-INFO +1 -1
  7. extract_python-0.3.0/benches/__init__.py +5 -0
  8. extract_python-0.3.0/benches/compare.ipynb +117 -0
  9. extract_python-0.3.0/benches/compare.py +152 -0
  10. extract_python-0.3.0/benches/constants.py +4 -0
  11. extract_python-0.3.0/data/.gitignore +2 -0
  12. extract_python-0.3.0/docker-compose.yml +107 -0
  13. extract_python-0.3.0/extract +42 -0
  14. {extract_python-0.1.0 → extract_python-0.3.0}/extract_python/docling_.py +47 -90
  15. {extract_python-0.1.0 → extract_python-0.3.0}/extract_python/marker_.py +29 -0
  16. {extract_python-0.1.0 → extract_python-0.3.0}/extract_python/miner_u.py +12 -1
  17. {extract_python-0.1.0 → extract_python-0.3.0}/extract_python/objects.py +70 -2
  18. {extract_python-0.1.0 → extract_python-0.3.0}/extract_python/pipeline.py +5 -1
  19. {extract_python-0.1.0 → extract_python-0.3.0}/pyproject.toml +22 -10
  20. extract_python-0.3.0/qa/ruff.toml +58 -0
  21. extract_python-0.3.0/uv.lock +5519 -0
  22. {extract_python-0.1.0 → extract_python-0.3.0}/.gitignore +0 -0
  23. {extract_python-0.1.0 → extract_python-0.3.0}/README.md +0 -0
  24. {extract_python-0.1.0 → extract_python-0.3.0}/extract_python/__init__.py +0 -0
  25. {extract_python-0.1.0 → extract_python-0.3.0}/extract_python/constants.py +0 -0
  26. {extract_python-0.1.0 → extract_python-0.3.0}/extract_python/utils.py +0 -0
@@ -0,0 +1,6 @@
1
+ *
2
+ !extract_python
3
+ !uv.lock
4
+ !pyproject.toml
5
+ !README.md
6
+
@@ -0,0 +1,45 @@
1
+ name: Publish extract-python
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ push:
6
+ tags:
7
+ - '*'
8
+
9
+ jobs:
10
+ create-release:
11
+ runs-on: ubuntu-latest
12
+ env:
13
+ PYTHON_VERSION: 3.12
14
+ ASTRAL_VERSION: 0.11.6
15
+ steps:
16
+ - uses: actions/checkout@v6
17
+ - name: Create GH release
18
+ run: gh release create "$tag" --generate-notes
19
+ env:
20
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
21
+ tag: ${{ github.ref_name }}
22
+
23
+ publish-extra-python-to-pypi:
24
+ runs-on: ubuntu-latest
25
+ permissions:
26
+ id-token: write
27
+ environment:
28
+ name: pypi
29
+ url: https://pypi.org/p/extract-python
30
+ steps:
31
+ - uses: actions/checkout@v6
32
+ - name: Install uv
33
+ uses: astral-sh/setup-uv@v7
34
+ with:
35
+ version: "0.10.8"
36
+ - name: Install Python 3.12
37
+ run: uv python install 3.12
38
+ - name: Build
39
+ run: uv build
40
+ - name: Publish
41
+ run: uv publish
42
+
43
+ concurrency:
44
+ group: ${{ github.workflow }}-${{ github.ref }}
45
+ cancel-in-progress: false
@@ -0,0 +1,79 @@
1
+ name: Tests extract-python
2
+
3
+ on:
4
+ push:
5
+ branches: [ 'main' ]
6
+ pull_request:
7
+
8
+ jobs:
9
+ lint:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v6
13
+ - uses: astral-sh/ruff-action@v3
14
+ with:
15
+ args: "--version" # skips test by displaying the version
16
+ - name: Check formatting
17
+ run: ruff format --config qa/ruff.toml --check extract_python tests
18
+ - name: Lint test
19
+ run: ruff check --config qa/ruff.toml extract_python tests
20
+
21
+ tests-no-miner-u:
22
+ runs-on: ubuntu-latest
23
+ env:
24
+ PYTHON_VERSION: 3.12
25
+ ASTRAL_VERSION: 0.11.6
26
+ steps:
27
+ - uses: actions/checkout@v6
28
+ - name: Setup Python project
29
+ uses: actions/setup-python@v6
30
+ with:
31
+ python-version: ${{ env.PYTHON_VERSION }}
32
+ - name: Install uv
33
+ uses: astral-sh/setup-uv@v7
34
+ with:
35
+ version: ${{ env.ASTRAL_VERSION }}
36
+ python-version: ${{ env.PYTHON_VERSION }}
37
+ enable-cache: true
38
+ - name: Install tesseract
39
+ run: |
40
+ sudo apt-get update
41
+ sudo apt-get install -y --fix-missing tesseract-ocr \
42
+ tesseract-ocr-eng \
43
+ tesseract-ocr-fra \
44
+ tesseract-ocr-deu \
45
+ tesseract-ocr-spa \
46
+ tesseract-ocr-lat \
47
+ tesseract-ocr-jpn \
48
+ libtesseract-dev \
49
+ libleptonica-dev \
50
+ pkg-config
51
+ echo "TESSDATA_PREFIX=$(sudo dpkg -L tesseract-ocr-eng | grep tessdata$)" >> $GITHUB_ENV
52
+ - name: Run tests
53
+ run: |
54
+ uv run --dev --extra docling --extra marker --frozen pytest -m "not miner_u" -vvv --cache-clear --show-capture=all -r A tests
55
+
56
+ tests-miner-u:
57
+ runs-on: ubuntu-latest
58
+ env:
59
+ PYTHON_VERSION: 3.12
60
+ ASTRAL_VERSION: 0.11.6
61
+ steps:
62
+ - uses: actions/checkout@v6
63
+ - name: Setup Python project
64
+ uses: actions/setup-python@v6
65
+ with:
66
+ python-version: ${{ env.PYTHON_VERSION }}
67
+ - name: Install uv
68
+ uses: astral-sh/setup-uv@v7
69
+ with:
70
+ version: ${{ env.ASTRAL_VERSION }}
71
+ python-version: ${{ env.PYTHON_VERSION }}
72
+ enable-cache: true
73
+ - name: Run tests
74
+ run: |
75
+ uv run --dev --extra mineru --frozen pytest -m "miner_u" -vvv --cache-clear --show-capture=all -r A tests
76
+
77
+ concurrency:
78
+ group: ${{ github.workflow }}-${{ github.ref }}
79
+ cancel-in-progress: true
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,76 @@
1
+ # syntax=docker/dockerfile:1.14.0
2
+ FROM python:3.10-slim-bullseye AS python-base
3
+
4
+ ENV HOME=/home/user
5
+ WORKDIR $HOME
6
+ RUN apt-get update && apt-get install -y curl
7
+
8
+ RUN curl -LsSf https://astral.sh/uv/0.6.7/install.sh | sh
9
+ ENV PATH="$HOME/.local/bin:$PATH"
10
+ ENV UV_LINK_MODE=copy
11
+ ENV UV_COMPILE_BYTECODE=1
12
+
13
+ FROM python-base AS worker-base
14
+
15
+ ARG dbmate_arch
16
+ WORKDIR $HOME/src/app
17
+ RUN curl -fsSL -o /usr/local/bin/dbmate https://github.com/amacneil/dbmate/releases/download/v2.19.0/dbmate-linux-${dbmate_arch} \
18
+ && chmod +x /usr/local/bin/dbmate
19
+
20
FROM worker-base AS worker-cpu
# CPU worker stage: installs Tesseract OCR with its language packs, then the
# Python dependencies selected by the `cpu` extra, then the service code.
# TODO: add more languages here
# Refresh the package index in the same layer as the install so that this step
# never relies on (possibly stale or cache-evicted) lists from an earlier stage.
RUN apt-get update && apt-get install -y tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-fra \
    tesseract-ocr-deu \
    tesseract-ocr-spa \
    tesseract-ocr-lat \
    tesseract-ocr-jpn \
    libtesseract-dev \
    libleptonica-dev \
    pkg-config
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
# We skip opencv since we already depend on opencv-python-headless which is the lib we need to use
# Install deps first to optimize layer cache
# The cache mount target must be an absolute path: BuildKit does not expand `~`.
# HOME is /home/user (set in the python-base stage), which is where uv's default
# cache directory is expected to live.
RUN --mount=type=cache,target=/home/user/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-sources --no-install-project --no-install-package opencv-python --extra cpu
RUN uv run --no-sync docling-tools models download -o ~/.cache/docling/models
# Then copy code
ADD uv.lock pyproject.toml README.md ./
ADD extract_python ./extract_python/
# Then install service
RUN uv sync -v --frozen --no-editable --no-sources --no-install-package opencv-python --extra cpu

RUN rm -rf ~/.cache/pip $(uv cache dir)

ENTRYPOINT ["uv", "run", "--no-sync", "icij-worker", "workers", "start", "-g", "cpu", "extract_python.app:app"]
49
+
50
+
51
FROM worker-base AS worker-miner-u
# MinerU worker stage: installs the Python dependencies selected by the
# `miner-u` extra, downloads the MinerU models, then installs the service code.
RUN apt-get update && apt-get install -y wget
# We skip opencv since we already depend on opencv-python-headless which is the lib we need to use
# Install deps first to optimize layer cache
# The cache mount target must be an absolute path: BuildKit does not expand `~`.
# HOME is /home/user (set in the python-base stage), which is where uv's default
# cache directory is expected to live.
RUN --mount=type=cache,target=/home/user/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-sources --no-install-project --no-install-package opencv-python --extra miner-u
# TODO: elegantly handle the version here...
# Download models
RUN wget "https://raw.githubusercontent.com/opendatalab/MinerU/refs/tags/magic_pdf-1.3.1-released/scripts/download_models_hf.py" -O download_models_hf.py \
    && uv run python download_models_hf.py
# Then copy code
ADD uv.lock pyproject.toml README.md ./
ADD extract_python ./extract_python/
# Then install service
RUN uv sync -v --frozen --no-editable --no-sources --no-install-package opencv-python --extra miner-u

RUN rm -rf ~/.cache/pip $(uv cache dir)

ENTRYPOINT ["uv", "run", "--no-sync", "icij-worker", "workers", "start", "-g", "miner-u", "extract_python.app:app"]
72
+
73
+ FROM icij/task-service:icij-worker-0.17.21 AS http-service
74
+ ADD uv.lock pyproject.toml README.md ./extract-python/
75
+ ADD extract_python ./extract-python/extract_python/
76
+ RUN uv pip install -e ./extract-python
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,5 @@
1
+ from pathlib import Path
2
+
3
+ ROOT_PATH = Path(__file__).parents[1]
4
+ DATA_PATH = ROOT_PATH / "data"
5
+ TEST_DATA_PATH = ROOT_PATH / "tests" / "data"
@@ -0,0 +1,117 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "312f3d3220d0b3a0",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import shutil\n",
11
+ "\n",
12
+ "from extract_python.benches import DATA_PATH, TEST_DATA_PATH\n",
13
+ "from extract_python.benches.compare import (\n",
14
+ " compare,\n",
15
+ ")\n",
16
+ "from extract_python.pipelines import DoclingPipeline, MarkerPipeline\n",
17
+ "from extract_python.objects import InputDoc, OutputFormat"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "id": "0a36e830-3fe8-4d65-9b46-f398300e22d9",
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "pdfs = [TEST_DATA_PATH / \"computer_generated.pdf\", TEST_DATA_PATH / \"scanned.pdf\"]\n",
28
+ "work_dir = DATA_PATH / \"workdir\"\n",
29
+ "comparison_dir = work_dir / \"comparison\"\n",
30
+ "docs = [InputDoc.from_path(pdf_path) for pdf_path in pdfs]"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "id": "6aba45d151eb7302",
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "from extract_python.pipelines import Pipeline\n",
41
+ "\n",
42
+ "pipelines: list[Pipeline] = [DoclingPipeline(), MarkerPipeline()]"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "id": "d13e58e4c286134b",
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "from extract_python.objects import Result\n",
53
+ "\n",
54
+ "if work_dir.exists():\n",
55
+ " shutil.rmtree(work_dir)\n",
56
+ "\n",
57
+ "for p in pipelines:\n",
58
+ " pipeline_dir = p.registered_name.lower().replace(\"pipeline\", \"\")\n",
59
+ " output_path = work_dir / pipeline_dir\n",
60
+ " output_path.mkdir(parents=True, exist_ok=True)\n",
61
+ " mds: list[Result] = [\n",
62
+ " r\n",
63
+ " async for r in p.extract_content( # noqa: PLE1142\n",
64
+ " docs, output_format=OutputFormat.MARKDOWN, output_path=output_path\n",
65
+ " )\n",
66
+ " ]\n",
67
+ " for md in mds:\n",
68
+ " pages_path = output_path / md.output.path / \"artifacts\" / \"pages.json\"\n",
69
+ " pages_path.write_text(md.output.pages.model_dump_json())"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "id": "75a4e7093ac41453",
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "from extract_python.benches.compare import discover_comparison\n",
80
+ "\n",
81
+ "if comparison_dir.exists():\n",
82
+ " shutil.rmtree(comparison_dir)\n",
83
+ "\n",
84
+ "compare(discover_comparison(pdfs, work_dir), root=work_dir, output_path=comparison_dir)"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "id": "814741bf4798974f",
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": []
94
+ }
95
+ ],
96
+ "metadata": {
97
+ "kernelspec": {
98
+ "display_name": "Python 3 (ipykernel)",
99
+ "language": "python",
100
+ "name": "python3"
101
+ },
102
+ "language_info": {
103
+ "codemirror_mode": {
104
+ "name": "ipython",
105
+ "version": 3
106
+ },
107
+ "file_extension": ".py",
108
+ "mimetype": "text/x-python",
109
+ "name": "python",
110
+ "nbconvert_exporter": "python",
111
+ "pygments_lexer": "ipython3",
112
+ "version": "3.10.11"
113
+ }
114
+ },
115
+ "nbformat": 4,
116
+ "nbformat_minor": 5
117
+ }
@@ -0,0 +1,152 @@
1
+ from pathlib import Path
2
+ from tempfile import TemporaryDirectory
3
+
4
+ import markdown2
5
+ import pypdfium2
6
+ from extract_python.pipelines.utils import chdir
7
+ from html2image import Html2Image
8
+ from PIL import Image, ImageDraw
9
+
10
+ from extract_python.objects import BaseModel, OutputFormat, PageIndexes
11
+
12
+ _WHITE_BACKGROUND_CSS = "body {background: white;}"
13
+
14
+
15
+ class ComparisonItem(BaseModel):
16
+ ref: Path
17
+ compared: list[Path]
18
+
19
+
20
+ def compare(comparisons: list[ComparisonItem], root: Path, output_path: Path) -> None:
21
+ output_path.mkdir(parents=True)
22
+ if not comparisons:
23
+ return
24
+ first_item = comparisons[0]
25
+ if not first_item.compared:
26
+ return
27
+ if (root / first_item.compared[0]).is_dir():
28
+ output_format = OutputFormat.MARKDOWN
29
+ else:
30
+ output_format = OutputFormat[first_item.compared[0].suffix]
31
+ match output_format:
32
+ case OutputFormat.MARKDOWN:
33
+ side_by_side_page_comp_fn = side_by_side_md_page_comp
34
+ case _:
35
+ raise ValueError(f"unsupported output format {output_format}")
36
+ for comparison in comparisons:
37
+ if not comparison.compared:
38
+ continue
39
+ # We flatten everything and will fail if 2 refs have the same file name, even
40
+ # if they have different paths. To be improved, potentially using nested
41
+ # structure or concatenating the path into a single dir name
42
+ comparison_dir = output_path / path_to_compared_name(comparison.ref)
43
+ comparison_dir.mkdir()
44
+ ref_path = root / comparison.ref
45
+ ref_pdf = pypdfium2.PdfDocument(ref_path)
46
+ # TODO: create TIFF or PDF
47
+ pages = _scan_pages(root, comparison)
48
+ for page_i, page_idxs in enumerate(pages):
49
+ pdf_page_im = ref_pdf.get_page(page_i).render().to_pil()
50
+ page_comparisons = []
51
+ for compared in comparison.compared:
52
+ compared_name = compared.parent.name
53
+ page_ix = page_idxs[compared_name]
54
+ page_comp_im = side_by_side_page_comp_fn(
55
+ ref_im=pdf_page_im,
56
+ compared_path=root / compared,
57
+ page_ix=page_ix,
58
+ compared_name=compared_name,
59
+ )
60
+ page_comparisons.append(page_comp_im)
61
+ page_comparison_path = comparison_dir / f"page_{page_i}.tiff"
62
+ page_comparisons[0].save(
63
+ page_comparison_path, save_all=True, append_images=page_comparisons[1:]
64
+ )
65
+ for p in page_comparisons:
66
+ p.close()
67
+
68
+
69
+ def discover_comparison(refs: list[Path], root: Path) -> list[ComparisonItem]:
70
+ name_to_ref = {path_to_compared_name(r): r for r in refs}
71
+ comparisons = {r: [] for r in refs}
72
+ for d in root.iterdir():
73
+ if not d.is_dir():
74
+ continue
75
+ for parsing in d.iterdir():
76
+ ref = name_to_ref.get(parsing.name)
77
+ if ref is None:
78
+ continue
79
+ comparisons[ref].append(parsing.relative_to(root))
80
+ comparisons = [
81
+ ComparisonItem(ref=ref, compared=compared)
82
+ for ref, compared in comparisons.items()
83
+ ]
84
+ return comparisons
85
+
86
+
87
+ def side_by_side_md_page_comp(
88
+ ref_im: Image,
89
+ compared_path: Path,
90
+ page_ix: tuple[int, int],
91
+ compared_name: str,
92
+ ) -> Image.Image:
93
+ md_files = list(compared_path.glob("*.md"))
94
+ if len(md_files) != 1:
95
+ msg = f"unexpected number of md files ({len(md_files)}) in {compared_path}"
96
+ raise ValueError(msg)
97
+ md_content = md_files[0].read_text()[page_ix[0] : page_ix[1]]
98
+ # change the current dir so that the browser renders images properly
99
+ with chdir(compared_path):
100
+ md_page_im = _render_md(md_content, compared_path, html_size=ref_im.size)
101
+ ref_im = _add_compared_name(ref_im, compared_name)
102
+ comparison_im = Image.new("RGB", (ref_im.width * 2, ref_im.height))
103
+ comparison_im.paste(ref_im, (0, 0))
104
+ comparison_im.paste(md_page_im, (ref_im.width, 0))
105
+ ref_im.close()
106
+ md_page_im.close()
107
+ return comparison_im
108
+
109
+
110
+ def path_to_compared_name(path: Path) -> str:
111
+ return f"{path.stem}_{path.suffix.replace('.', '')}"
112
+
113
+
114
+ def _add_compared_name(ref_im: Image, compared_name: str) -> Image:
115
+ with_name = ref_im.copy()
116
+ d = ImageDraw.Draw(with_name)
117
+ d.text((0, 0), compared_name, font_size=24, fill=(255, 0, 0))
118
+ return with_name
119
+
120
+
121
def _render_md(
    md_content: str, md_path: Path, html_size: tuple[int, int]
) -> Image.Image:
    """Render a markdown snippet to an image by screenshotting its HTML.

    The markdown is converted to HTML with ``markdown2``, every ``<img src="``
    is prefixed with a ``file://`` URL pointing at ``md_path`` and the result is
    screenshotted with ``Html2Image`` on a white background.

    Args:
        md_content: markdown text to render.
        md_path: directory used as the base of the ``file://`` image URLs.
        html_size: size passed to ``Html2Image`` for the screenshot.

    Returns:
        An in-memory copy of the screenshot, independent of the temporary
        directory it was written to.

    Raises:
        RuntimeError: if the screenshot call returns more than one file.
    """
    with TemporaryDirectory() as tmpdir:
        # TODO: check that we're handling images correctly,
        #  maybe make md images absolute or something like this
        hti = Html2Image(size=html_size, output_path=tmpdir)
        html = markdown2.markdown(md_content)
        html = html.replace('<img src="', f'<img src="file://{md_path.absolute()}/')
        screen_files = hti.screenshot(html_str=html, css_str=_WHITE_BACKGROUND_CSS)
        if len(screen_files) > 1:
            msg = (
                "unexpected state, found multiple screenshots, "
                "either set a large html_size or find a way to combine them into a"
                " single image"
            )
            raise RuntimeError(msg)
        # Image.open is lazy: pixel data is only read on first access, and the
        # file lives in a directory that is deleted when this block exits.
        # Copy the image into memory and release the file handle before that.
        with Image.open(screen_files[0]) as screenshot:
            return screenshot.copy()
138
+
139
+
140
def _scan_pages(
    root: Path, comparison: ComparisonItem
) -> list[dict[str, tuple[int, int]]]:
    """Collect, page by page, the page indexes of every compared output.

    For each path in ``comparison.compared`` the file
    ``<root>/<compared>/artifacts/pages.json`` is parsed as ``PageIndexes`` and
    its ``root`` value is read. The per-output sequences are then zipped so that
    each returned entry describes one page.

    Args:
        root: directory the ``compared`` paths are relative to.
        comparison: item whose ``compared`` paths are scanned.

    Returns:
        One dict per page, mapping the name of each compared path's parent
        directory to that output's entry for the page. As with ``zip``, the
        result is truncated to the shortest per-output sequence.
    """
    all_pages = [
        PageIndexes.model_validate_json(
            (root / compared / "artifacts" / "pages.json").read_text()
        ).root
        for compared in comparison.compared
    ]
    # Must be a list: the names are re-used for every page, and a generator
    # would be exhausted after the first page, leaving later pages empty.
    compared_names = [p.parent.name for p in comparison.compared]
    return [
        dict(zip(compared_names, page_comp_ixs)) for page_comp_ixs in zip(*all_pages)
    ]
@@ -0,0 +1,4 @@
1
+ from pathlib import Path
2
+
3
+ ROOT_PATH = Path(__file__).parents[1]
4
+ DATA_PATH = ROOT_PATH / "data"
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
@@ -0,0 +1,107 @@
1
+ version: '3.7'
2
+
3
+ x-tm-amqp-config-variables: &tm-amqp-config
4
+ TASK_MANAGER__RABBITMQ_HOST: rabbitmq
5
+
6
+ x-postgres-storage-config: &tm-postgres-storage-config
7
+ TASK_MANAGER__BACKEND: amqp
8
+ # Change this to a FSKeyValueStorageConfig if you don't want to use postgres
9
+ TASK_MANAGER__STORAGE__HOST: postgres
10
+ TASK_MANAGER__STORAGE__PORT: 5432
11
+ TASK_MANAGER__STORAGE__PASSWORD: changeme
12
+
13
+ x-worker-config-variables: &worker-config
14
+ ICIJ_WORKER_TYPE: amqp
15
+ ICIJ_WORKER_RABBITMQ_HOST: rabbitmq
16
+ ICIJ_WORKER_RABBITMQ_PORT: 5672
17
+
18
+ x-async-app-variables: &async-app
19
+ EXTRACT_DATA_DIR: /usr/src/data
20
+ EXTRACT_WORK_DIR: /usr/src/data/workdir
21
+ EXTRACT_LOG_LEVEL: DEBUG
22
+
23
+
24
+ services:
25
+ rabbitmq:
26
+ image: rabbitmq:3.12.0-management
27
+ container_name: extract-rabbitmq
28
+ healthcheck:
29
+ test: rabbitmq-diagnostics -q status
30
+ interval: 5s
31
+ timeout: 2s
32
+ retries: 10
33
+ start_period: 5s
34
+ ports:
35
+ - "5672:5672"
36
+ - "15672:15672"
37
+
38
+ postgres:
39
+ image: postgres
40
+ container_name: extract-postgres
41
+ environment:
42
+ POSTGRES_PASSWORD: changeme
43
+ healthcheck:
44
+ test: pg_isready
45
+ interval: 2s
46
+ timeout: 2s
47
+ retries: 10
48
+ start_period: 5s
49
+ ports:
50
+ - "5435:5432"
51
+
52
+ http-service:
53
+ depends_on:
54
+ rabbitmq:
55
+ condition: service_healthy
56
+ postgres:
57
+ condition: service_healthy
58
+ build:
59
+ context: .
60
+ target: http-service
61
+ container_name: extract-http-service
62
+ environment:
63
+ <<: [ *tm-amqp-config, *tm-postgres-storage-config ]
64
+ PORT: "8000"
65
+ HOST: "0.0.0.0"
66
+ LOG_LEVEL: DEBUG
67
+ TASK_MANAGER__APP_PATH: extract_app
68
+ healthcheck:
69
+ test: curl -f http://localhost:8000/health
70
+ interval: 5s
71
+ timeout: 2s
72
+ retries: 10
73
+ start_period: 5s
74
+ ports:
75
+ - "8000:8000"
76
+
77
+ extract-worker-cpu:
78
+ depends_on:
79
+ http-service:
80
+ condition: service_healthy
81
+ build:
82
+ context: .
83
+ args:
84
+ dbmate_arch: $DBMATE_ARCH
85
+ target: worker-cpu
86
+ environment:
87
+ <<: [ *worker-config, *async-app ]
88
+ volumes:
89
+ - type: bind
90
+ source: ./data
91
+ target: /usr/src/data
92
+
93
+ extract-worker-miner-u:
94
+ depends_on:
95
+ http-service:
96
+ condition: service_healthy
97
+ build:
98
+ context: .
99
+ args:
100
+ dbmate_arch: $DBMATE_ARCH
101
+ target: worker-miner-u
102
+ environment:
103
+ <<: [ *worker-config, *async-app ]
104
+ volumes:
105
+ - type: bind
106
+ source: ./data
107
+ target: /usr/src/data
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env bash
2
+
3
+ function _export_globals() {
4
+ DBMATE_ARCH=$(dbmate_arch)
5
+ export DBMATE_ARCH
6
+ }
7
+
8
+ function _helpers() {
9
+ function dbmate_arch() {
10
+ local host_arch
11
+ if command -v arch >/dev/null 2>&1; then
12
+ host_arch=$(arch)
13
+ else
14
+ host_arch=$(uname -m)
15
+ fi
16
+ local dbmate_arch_
17
+ if [ "$host_arch" == "x86_64" ] ||[ "$host_arch" == "amd64" ]; then
18
+ dbmate_arch_="amd64"
19
+ elif [ "$host_arch" == "aarch64" ] || [ "$host_arch" == "arm64" ]; then
20
+ dbmate_arch_="arm64"
21
+ elif [ "$host_arch" == "i386" ] ; then
22
+ dbmate_arch_="386"
23
+ else
24
+ _exit_with_message "Unsupported architecture $host_arch"
25
+ fi
26
+ echo "$dbmate_arch_"
27
+ }
28
+
29
+ }
30
+
31
# Entry point: defines the error helper, loads the helper functions, exports
# the globals, then forwards all arguments to `docker compose`.
function _main() {
    set -e
    # Print $1 and exit with status $2 (default 1).
    # The message goes to stderr so it stays visible when the caller runs
    # inside a command substitution, e.g. DBMATE_ARCH=$(dbmate_arch), where
    # stdout is captured into a variable instead of being shown.
    function _exit_with_message() {
        echo "$1" >&2
        exit "${2:-1}"
    }
    _helpers
    _export_globals
    docker compose "$@"
}
41
+
42
+ _main "$@"