PyPI - paperless-ghostnode - Versions diffs - 0.1.0__py3-none-any.whl - Mend

paperless-ghostnode 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

paperless_ghostnode/__init__.py +0 -0
paperless_ghostnode/apps.py +14 -0
paperless_ghostnode/parsers.py +57 -0
paperless_ghostnode/signals.py +22 -0
paperless_ghostnode-0.1.0.dist-info/METADATA +139 -0
paperless_ghostnode-0.1.0.dist-info/RECORD +7 -0
paperless_ghostnode-0.1.0.dist-info/WHEEL +4 -0

paperless_ghostnode/__init__.py ADDED Viewed

File without changes

paperless_ghostnode/apps.py ADDED Viewed

@@ -0,0 +1,14 @@
+from django.apps import AppConfig
+class GhostNodeConfig(AppConfig):
+    name = "paperless_ghostnode"
+    def ready(self):
+        from documents.signals import document_consumer_declaration
+        from paperless_ghostnode.signals import ghostnode_consumer_declaration
+        document_consumer_declaration.connect(ghostnode_consumer_declaration)
+        AppConfig.ready(self)

paperless_ghostnode/parsers.py ADDED Viewed

@@ -0,0 +1,57 @@
+import os
+import urllib.parse
+import urllib.request
+from pathlib import Path
+from documents.parsers import ParseError
+from paperless_tesseract.parsers import RasterisedDocumentParser
+DEFAULT_MODE = "skip"
+class GhostNodeDocumentParser(RasterisedDocumentParser):
+    """Routes OCR through GhostNode instead of Tesseract."""
+    logging_name = "paperless.parsing.ghostnode"
+    def parse(self, document_path, mime_type, file_name=None):
+        hosts = [
+            host.strip()
+            for host in os.environ.get("PAPERLESS_GHOSTNODE_HOSTS", "").split(",")
+            if host.strip()
+        ]
+        if not hosts:
+            raise ParseError(
+                "PAPERLESS_GHOSTNODE_HOSTS is unset or empty, set it to a comma-separated list of host:port entries.",
+            )
+        mode = os.environ.get("PAPERLESS_GHOSTNODE_MODE", DEFAULT_MODE)
+        archive_path = Path(self.tempdir) / "archive.pdf"
+        data = Path(document_path).read_bytes()
+        query = urllib.parse.urlencode({"mode": mode})
+        last_error = None
+        for host in hosts:
+            url = f"http://{host}/api/v1/ocr?{query}"
+            try:
+                self.log.debug(f"Sending document to GhostNode at {host} (mode={mode})")
+                request = urllib.request.Request(
+                    url,
+                    data=data,
+                    headers={"Content-Type": mime_type},
+                    method="POST",
+                )
+                # urlopen raises HTTPError on non-2xx
+                with urllib.request.urlopen(request, timeout=300) as response:
+                    archive_path.write_bytes(response.read())
+                self.archive_path = archive_path
+                self.text = self.extract_text(None, archive_path)
+                self.log.debug(f"GhostNode OCR done via {host}")
+                return
+            except Exception as e:
+                last_error = e
+                self.log.warning(f"GhostNode host {host} failed: {e}")
+        raise ParseError(
+            f"GhostNode OCR failed on all hosts ({', '.join(hosts)}): {last_error}",
+        )

paperless_ghostnode/signals.py ADDED Viewed

@@ -0,0 +1,22 @@
+def get_parser(*args, **kwargs):
+    from paperless_ghostnode.parsers import GhostNodeDocumentParser
+    return GhostNodeDocumentParser(*args, **kwargs)
+def ghostnode_consumer_declaration(sender, **kwargs):
+    # weight 1 > Tesseract's 0, so GhostNode wins for these types
+    return {
+        "parser": get_parser,
+        "weight": 1,
+        "mime_types": {
+            "application/pdf": ".pdf",
+            "image/jpeg": ".jpg",
+            "image/png": ".png",
+            "image/tiff": ".tif",
+            "image/gif": ".gif",
+            "image/bmp": ".bmp",
+            "image/webp": ".webp",
+            "image/heic": ".heic",
+        },
+    }

paperless_ghostnode-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,139 @@
+Metadata-Version: 2.4
+Name: paperless-ghostnode
+Version: 0.1.0
+Summary: Paperless-ngx parser that routes OCR through GhostNode instead of Tesseract
+Project-URL: Homepage, https://github.com/sevenautumns/GhostNode
+License: Apache-2.0
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+# paperless-ghostnode
+paperless-ghostnode is a paperless plugin for routing OCR through GhostNode instead of Tesseract.
+The plugin registers a consumer which does a simple HTTP request to a number of GhostNodes.
+Should all GhostNodes fail, the processing fails.
+A failed run is not too bad: the failed files stay in the consume folder, where they can be "retried" (unfortunately, there is no retry button, but restarting paperless re-scans the folder and "re-runs" them).
+## Installation
+Depending on your setup, use one of the installation methods below.
+### pip
+Install paperless-ghostnode with a plain `pip install`:
+```
+pip install paperless-ghostnode
+```
+### Docker Compose
+When using Docker Compose, we need to give paperless access to the files of paperless-ghostnode.
+One of the easiest ways is to add an init service that installs the plugin into a shared volume via pip, and then to mount that volume into the paperless-worker.
+```yaml
+services:
+  paperless-ghostnode-install:
+    image: python:slim
+    command: ["pip", "install", "--target", "/plugins", "paperless-ghostnode"]
+    volumes:
+      - paperless-ghostnode:/plugins
+  paperless-worker:
+    # ...
+    depends_on:
+      paperless-ghostnode-install:
+        condition: service_completed_successfully
+    volumes:
+      - paperless-ghostnode:/plugins
+    environment:
+      PYTHONPATH: /plugins
+      PAPERLESS_APPS: paperless_ghostnode
+      PAPERLESS_GHOSTNODE_HOSTS: "192.168.x.x:8080"
+volumes:
+  paperless-ghostnode: {}
+```
+### Nix
+When using Nix, we can just fetch the plugin from PyPI, build it, and then directly add it to the `PYTHONPATH` of the `paperless-task-queue` service.
+```nix
+let
+  paperless-ghostnode = pkgs.python3Packages.buildPythonPackage {
+    pname = "paperless-ghostnode";
+    version = "0.1.0";
+    src = pkgs.fetchPypi {
+      pname = "paperless-ghostnode";
+      version = "0.1.0";
+      sha256 = "sha256-...";
+    };
+    format = "pyproject";
+    nativeBuildInputs = [ pkgs.python3Packages.hatchling ];
+  };
+in {
+  systemd.services.paperless-task-queue.environment = {
+    PAPERLESS_APPS = "paperless_ghostnode";
+    PYTHONPATH = "${paperless-ghostnode}/${pkgs.python3.sitePackages}";
+    PAPERLESS_GHOSTNODE_HOSTS = "192.168.x.x:8080";
+  };
+}
+```
+## Configuration
+- `PAPERLESS_APPS=paperless_ghostnode`: The plugin needs to be registered as an app
+- `PAPERLESS_GHOSTNODE_HOSTS`: comma-separated list of `host:port` entries, tried in order. Should point to addresses which may host an active GhostNode app
+- `PAPERLESS_GHOSTNODE_MODE`: `skip` (default), `force`, or `all`
+## Why a parser and not a pre-consume script
+Paperless supports [pre-consumption scripts](https://docs.paperless-ngx.com/advanced_usage/#pre-consume-script) that run before a document is processed.
+Instead of the paperless-ghostnode plugin, a pre-consumption script can be used to OCR documents with GhostNode.
+The main problem with this approach is that the pre-consumption script is not considered "processing" by paperless.
+I have not tested what happens if "reprocess" is manually triggered for a document from within paperless while the pre-consumption script is in use. I would guess that either the script is not rerun, which would defeat the purpose of "reprocessing", or it is rerun, which would mean that it runs on an already processed PDF, because the original is lost when a pre-consumption script replaces the file.
+If you still want to use it despite these drawbacks, the script is below.
+### Pre-Consumption-Script
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+GHOSTNODE_URL="${GHOSTNODE_URL:-http://<ghostnode-ip>:8080}"
+case "$DOCUMENT_WORKING_PATH" in
+  *.pdf | *.PDF) ;;
+  *)
+    echo "ghostnode-ocr: not a PDF, skipping"
+    exit 0
+    ;;
+esac
+tmp=$(mktemp /tmp/ghostnode-ocr-XXXXXX.pdf)
+trap 'rm -f "$tmp"' EXIT
+echo "ghostnode-ocr: sending to GhostNode…"
+if ! curl \
+  --silent \
+  --fail \
+  --output "$tmp" \
+  --max-time 300 \
+  --request POST \
+  --header "Content-Type: application/pdf" \
+  --data-binary "@$DOCUMENT_WORKING_PATH" \
+  "$GHOSTNODE_URL/api/v1/ocr?mode=skip"; then
+  echo "ghostnode-ocr: GhostNode unreachable or returned error"
+  exit 1
+fi
+cp "$tmp" "$DOCUMENT_WORKING_PATH"
+echo "ghostnode-ocr: done"
+```
+Point Paperless at it via `PAPERLESS_PRE_CONSUME_SCRIPT` and set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` so Paperless doesn't re-OCR the result with Tesseract afterwards.

paperless_ghostnode-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+paperless_ghostnode/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+paperless_ghostnode/apps.py,sha256=kz-YNi6HpI0z57aNIzHJGBcFZSfMLLzFrRktO61ncpg,383
+paperless_ghostnode/parsers.py,sha256=EJjV6w1FvpA5Px0OMS7WbOuNwqAMVmO68pFuBBwF2ms,2111
+paperless_ghostnode/signals.py,sha256=Eh72vFWEVPPbhYxkslVpAckH8Xcs_9ji71pFqSy6Spw,657
+paperless_ghostnode-0.1.0.dist-info/METADATA,sha256=-4dKiYXgSzARnZzNqwgX0S6MMxobZ4FJYBlvgX_7KIg,4691
+paperless_ghostnode-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+paperless_ghostnode-0.1.0.dist-info/RECORD,,

paperless_ghostnode-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any