paperless-ghostnode 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,14 @@
1
+ from django.apps import AppConfig
2
+
3
+
4
class GhostNodeConfig(AppConfig):
    """Django app configuration for the paperless_ghostnode plugin.

    Once the app is ready, the GhostNode parser declaration is connected to
    the document consumer declaration signal.
    """

    name = "paperless_ghostnode"

    def ready(self):
        """Connect the GhostNode declaration handler, then run the base hook."""
        # Both modules are imported inside ready() instead of at module level
        # (presumably so they load only after the app registry is populated).
        from documents import signals as document_signals

        from paperless_ghostnode import signals as ghostnode_signals

        document_signals.document_consumer_declaration.connect(
            ghostnode_signals.ghostnode_consumer_declaration,
        )

        super().ready()
@@ -0,0 +1,57 @@
1
+ import os
2
+ import urllib.parse
3
+ import urllib.request
4
+ from pathlib import Path
5
+
6
+ from documents.parsers import ParseError
7
+ from paperless_tesseract.parsers import RasterisedDocumentParser
8
+
9
+ DEFAULT_MODE = "skip"
10
+
11
+
12
class GhostNodeDocumentParser(RasterisedDocumentParser):
    """Routes OCR through GhostNode instead of Tesseract.

    Configuration is read from the environment on every ``parse`` call:

    * ``PAPERLESS_GHOSTNODE_HOSTS``: comma-separated ``host:port`` entries,
      tried in the order given until one succeeds.
    * ``PAPERLESS_GHOSTNODE_MODE``: forwarded to GhostNode as the ``mode``
      query parameter; ``DEFAULT_MODE`` is used when it is unset or blank.
    """

    logging_name = "paperless.parsing.ghostnode"

    # Seconds to wait for a single GhostNode host before moving to the next.
    _ghostnode_timeout_seconds = 300

    @staticmethod
    def _ghostnode_hosts():
        """Return the non-blank, stripped entries of PAPERLESS_GHOSTNODE_HOSTS."""
        raw_hosts = os.environ.get("PAPERLESS_GHOSTNODE_HOSTS", "")
        return [entry.strip() for entry in raw_hosts.split(",") if entry.strip()]

    @staticmethod
    def _ghostnode_request(url, data, mime_type, timeout):
        """POST ``data`` to ``url`` and return the non-empty response body.

        Raises:
            ValueError: if the host answers with an empty body, which cannot
                be a usable archive PDF.
            Exception: whatever ``urlopen`` raises (HTTPError on non-2xx,
                URLError / timeouts on connection problems).
        """
        request = urllib.request.Request(
            url,
            data=data,
            headers={"Content-Type": mime_type},
            method="POST",
        )
        # urlopen raises HTTPError on non-2xx
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        if not body:
            raise ValueError("GhostNode returned an empty response body")
        return body

    def parse(self, document_path, mime_type, file_name=None):
        """OCR ``document_path`` via the first GhostNode host that succeeds.

        The document bytes are POSTed to ``/api/v1/ocr`` on each configured
        host in turn. The first successful response is written to
        ``archive.pdf`` inside ``self.tempdir``; ``self.archive_path`` and
        ``self.text`` are then set from it.

        Args:
            document_path: Path of the document to OCR.
            mime_type: Sent unchanged as the request's ``Content-Type``.
            file_name: Accepted for interface compatibility; not used here.

        Raises:
            ParseError: if no hosts are configured, or if every host failed.
                In the latter case the last host's error is chained as the
                cause.
        """
        hosts = self._ghostnode_hosts()
        if not hosts:
            raise ParseError(
                "PAPERLESS_GHOSTNODE_HOSTS is unset or empty, set it to a comma-separated list of host:port entries.",
            )
        # A variable that is set but blank would otherwise be sent as "mode=".
        mode = os.environ.get("PAPERLESS_GHOSTNODE_MODE", "").strip() or DEFAULT_MODE

        archive_path = Path(self.tempdir) / "archive.pdf"
        data = Path(document_path).read_bytes()
        query = urllib.parse.urlencode({"mode": mode})

        last_error = None
        for host in hosts:
            url = f"http://{host}/api/v1/ocr?{query}"
            try:
                self.log.debug(f"Sending document to GhostNode at {host} (mode={mode})")
                archive_path.write_bytes(
                    self._ghostnode_request(
                        url,
                        data,
                        mime_type,
                        self._ghostnode_timeout_seconds,
                    ),
                )
                self.archive_path = archive_path
                self.text = self.extract_text(None, archive_path)
                self.log.debug(f"GhostNode OCR done via {host}")
                return
            except Exception as e:
                # Deliberately broad: any failure on this host (network, HTTP
                # status, empty body, text extraction) falls through to the
                # next configured host instead of aborting the parse.
                last_error = e
                self.log.warning(f"GhostNode host {host} failed: {e}")

        raise ParseError(
            f"GhostNode OCR failed on all hosts ({', '.join(hosts)}): {last_error}",
        ) from last_error
@@ -0,0 +1,22 @@
1
def get_parser(*args, **kwargs):
    """Create a GhostNodeDocumentParser, forwarding all arguments unchanged.

    The parser module is imported on each call rather than when this module
    is loaded.
    """
    from paperless_ghostnode import parsers

    return parsers.GhostNodeDocumentParser(*args, **kwargs)
5
+
6
+
7
def ghostnode_consumer_declaration(sender, **kwargs):
    """Signal handler that declares the GhostNode parser.

    Returns a dict with the parser factory, its weight, and a mapping from
    each handled MIME type to its default file extension. ``sender`` and any
    extra keyword arguments are ignored.
    """
    handled_types = (
        ("application/pdf", ".pdf"),
        ("image/jpeg", ".jpg"),
        ("image/png", ".png"),
        ("image/tiff", ".tif"),
        ("image/gif", ".gif"),
        ("image/bmp", ".bmp"),
        ("image/webp", ".webp"),
        ("image/heic", ".heic"),
    )
    declaration = {
        "parser": get_parser,
        # weight 1 > Tesseract's 0, so GhostNode wins for these types
        "weight": 1,
        "mime_types": dict(handled_types),
    }
    return declaration
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: paperless-ghostnode
3
+ Version: 0.1.0
4
+ Summary: Paperless-ngx parser that routes OCR through GhostNode instead of Tesseract
5
+ Project-URL: Homepage, https://github.com/sevenautumns/GhostNode
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+
10
+ # paperless-ghostnode
11
+
12
+ paperless-ghostnode is a paperless plugin for routing OCR through GhostNode instead of Tesseract.
13
+ The plugin registers a parser that sends a simple HTTP request to each configured GhostNode in turn until one of them succeeds.
14
+ Should all GhostNodes fail, the processing fails.
15
+
16
+ A failed processing run is not too bad: the failed files stay in the consume folder, where they can be "retried". (Unfortunately, there is no retry button, but restarting paperless re-scans the folder and "re-runs" them.)
17
+
18
+ ## Installation
19
+
20
+ Depending on your setup, choose one of the following installation methods.
21
+
22
+ ### pip
23
+
24
+ Install paperless-ghostnode with a plain `pip install`:
25
+
26
+ ```
27
+ pip install paperless-ghostnode
28
+ ```
29
+
30
+ ### Docker Compose
31
+
32
+ When using docker compose, we need to somehow give paperless access to the files of paperless-ghostnode.
33
+ One of the easiest ways is to add an init service that installs the plugin into a shared volume via pip, and then mount that volume into the paperless-worker.
34
+
35
+ ```yaml
36
+ services:
37
+ paperless-ghostnode-install:
38
+ image: python:slim
39
+ command: ["pip", "install", "--target", "/plugins", "paperless-ghostnode"]
40
+ volumes:
41
+ - paperless-ghostnode:/plugins
42
+
43
+ paperless-worker:
44
+ # ...
45
+ depends_on:
46
+ paperless-ghostnode-install:
47
+ condition: service_completed_successfully
48
+ volumes:
49
+ - paperless-ghostnode:/plugins
50
+ environment:
51
+ PYTHONPATH: /plugins
52
+ PAPERLESS_APPS: paperless_ghostnode
53
+ PAPERLESS_GHOSTNODE_HOSTS: "192.168.x.x:8080"
54
+
55
+ volumes:
56
+ paperless-ghostnode: {}
57
+ ```
58
+
59
+ ### Nix
60
+
61
+ When using Nix, we can just fetch the plugin from PyPI, build it, and then directly attach it to the `PYTHONPATH` of the `paperless-task-queue` service.
62
+
63
+ ```nix
64
+ let
65
+ paperless-ghostnode = pkgs.python3Packages.buildPythonPackage {
66
+ pname = "paperless-ghostnode";
67
+ version = "0.1.0";
68
+ src = pkgs.fetchPypi {
69
+ pname = "paperless-ghostnode";
70
+ version = "0.1.0";
71
+ sha256 = "sha256-...";
72
+ };
73
+ format = "pyproject";
74
+ nativeBuildInputs = [ pkgs.python3Packages.hatchling ];
75
+ };
76
+ in {
77
+ systemd.services.paperless-task-queue.environment = {
78
+ PAPERLESS_APPS = "paperless_ghostnode";
79
+ PYTHONPATH = "${paperless-ghostnode}/${pkgs.python3.sitePackages}";
80
+ PAPERLESS_GHOSTNODE_HOSTS = "192.168.x.x:8080";
81
+ };
82
+ }
83
+ ```
84
+
85
+ ## Configuration
86
+
87
+ - `PAPERLESS_APPS=paperless_ghostnode`: The plugin needs to be registered as an app
88
+ - `PAPERLESS_GHOSTNODE_HOSTS`: comma-separated list of `host:port` entries, tried in order. Should point to addresses which may host an active GhostNode app
89
+ - `PAPERLESS_GHOSTNODE_MODE`: `skip` (default), `force`, or `all`
90
+
91
+ ## Why a parser and not a pre-consume script
92
+
93
+ Paperless supports [pre-consumption scripts](https://docs.paperless-ngx.com/advanced_usage/#pre-consume-script) that run before a document is processed.
94
+
95
+ Instead of the paperless-ghostnode plugin, a pre-consumption script can be used to OCR documents with GhostNode.
96
+ The main problem with that is that the pre-consumption-script is not considered "processing" by paperless.
97
+
98
+ I have not tested what happens if a "reprocess" of a document is manually triggered from within paperless while the pre-consumption script is in use. My guess is that either the script is not rerun, which would defeat the purpose of "reprocessing", or it is rerun on an already processed PDF, because the pre-consumption script overwrites the original file and the original is therefore lost.
99
+
100
+ If you still want to use it despite these drawbacks, the script is below.
101
+
102
+ ### Pre-Consumption-Script
103
+
104
+ ```bash
105
+ #!/usr/bin/env bash
106
+ set -euo pipefail
107
+
108
+ GHOSTNODE_URL="${GHOSTNODE_URL:-http://<ghostnode-ip>:8080}"
109
+
110
+ case "$DOCUMENT_WORKING_PATH" in
111
+ *.pdf | *.PDF) ;;
112
+ *)
113
+ echo "ghostnode-ocr: not a PDF, skipping"
114
+ exit 0
115
+ ;;
116
+ esac
117
+
118
+ tmp=$(mktemp /tmp/ghostnode-ocr-XXXXXX.pdf)
119
+ trap 'rm -f "$tmp"' EXIT
120
+
121
+ echo "ghostnode-ocr: sending to GhostNode…"
122
+ if ! curl \
123
+ --silent \
124
+ --fail \
125
+ --output "$tmp" \
126
+ --max-time 300 \
127
+ --request POST \
128
+ --header "Content-Type: application/pdf" \
129
+ --data-binary "@$DOCUMENT_WORKING_PATH" \
130
+ "$GHOSTNODE_URL/api/v1/ocr?mode=skip"; then
131
+ echo "ghostnode-ocr: GhostNode unreachable or returned error"
132
+ exit 1
133
+ fi
134
+
135
+ cp "$tmp" "$DOCUMENT_WORKING_PATH"
136
+ echo "ghostnode-ocr: done"
137
+ ```
138
+
139
+ Point Paperless at it via `PAPERLESS_PRE_CONSUME_SCRIPT` and set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` so Paperless doesn't re-OCR the result with Tesseract afterwards.
@@ -0,0 +1,7 @@
1
+ paperless_ghostnode/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ paperless_ghostnode/apps.py,sha256=kz-YNi6HpI0z57aNIzHJGBcFZSfMLLzFrRktO61ncpg,383
3
+ paperless_ghostnode/parsers.py,sha256=EJjV6w1FvpA5Px0OMS7WbOuNwqAMVmO68pFuBBwF2ms,2111
4
+ paperless_ghostnode/signals.py,sha256=Eh72vFWEVPPbhYxkslVpAckH8Xcs_9ji71pFqSy6Spw,657
5
+ paperless_ghostnode-0.1.0.dist-info/METADATA,sha256=-4dKiYXgSzARnZzNqwgX0S6MMxobZ4FJYBlvgX_7KIg,4691
6
+ paperless_ghostnode-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
7
+ paperless_ghostnode-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any