paperless-ghostnode 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ .DS_Store
2
+ *.swp
3
+ *~.nib
4
+ DerivedData/
5
+ build/
6
+ .build/
7
+ *.xcodeproj/project.xcworkspace/xcuserdata/
8
+ *.xcodeproj/xcuserdata/
9
+ *.pbxuser
10
+ *.mode1v3
11
+ *.mode2v3
12
+ *.perspectivev3
13
+ !default.pbxuser
14
+ !default.mode1v3
15
+ !default.mode2v3
16
+ !default.perspectivev3
17
+ xcuserdata
18
+ !xcshareddata
19
+ !xcschemes
20
+ *.moved-aside
21
+ /Pods
22
+ /Carthage
23
+
24
+ .swiftpm
25
+ /node_modules
26
+ /tmp
27
+
28
+ .env
29
+ .env.*
30
+
31
+ **/fastlane/report.xml
32
+ **/fastlane/Preview.html
33
+ **/fastlane/screenshots
34
+ **/fastlane/test_output
35
+
36
+ .direnv/
37
+
38
+ dist/
39
+ __pycache__/
40
+ *.pyc
41
+ .python-version
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: paperless-ghostnode
3
+ Version: 0.1.0
4
+ Summary: Paperless-ngx parser that routes OCR through GhostNode instead of Tesseract
5
+ Project-URL: Homepage, https://github.com/sevenautumns/GhostNode
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+
10
+ # paperless-ghostnode
11
+
12
+ paperless-ghostnode is a paperless plugin for routing OCR through GhostNode instead of Tesseract.
13
+ The plugin registers a consumer which does a simple HTTP request to a number of GhostNodes.
14
+ Should all GhostNodes fail, the processing fails.
15
+
16
+ A failed processing is not too bad. It keeps the failed files in the consume-folder where they can be "retried" (unfortunately, there is no retry button, but a restart of paperless re-scans the folder and "re-runs" them).
17
+
18
+ ## Installation
19
+
20
+ Depending on your setup, choose one of the installation methods below.
21
+
22
+ ### pip
23
+
24
+ plain `pip install` of paperless-ghostnode
25
+
26
+ ```
27
+ pip install paperless-ghostnode
28
+ ```
29
+
30
+ ### Docker Compose
31
+
32
+ When using docker compose, we need to somehow give paperless access to the files of paperless-ghostnode.
33
+ One of the easiest ways is to use an init-service that installs the plugin into a shared volume via pip, and then to mount that volume into the paperless-worker.
34
+
35
+ ```yaml
36
+ services:
37
+ paperless-ghostnode-install:
38
+ image: python:slim
39
+ command: ["pip", "install", "--target", "/plugins", "paperless-ghostnode"]
40
+ volumes:
41
+ - paperless-ghostnode:/plugins
42
+
43
+ paperless-worker:
44
+ # ...
45
+ depends_on:
46
+ paperless-ghostnode-install:
47
+ condition: service_completed_successfully
48
+ volumes:
49
+ - paperless-ghostnode:/plugins
50
+ environment:
51
+ PYTHONPATH: /plugins
52
+ PAPERLESS_APPS: paperless_ghostnode
53
+ PAPERLESS_GHOSTNODE_HOSTS: "192.168.x.x:8080"
54
+
55
+ volumes:
56
+ paperless-ghostnode: {}
57
+ ```
58
+
59
+ ### Nix
60
+
61
+ When using nix, we can just load the plugin from PyPI, build it, and then directly attach it to the `PYTHONPATH` of the `paperless-task-queue` service.
62
+
63
+ ```nix
64
+ let
65
+ paperless-ghostnode = pkgs.python3Packages.buildPythonPackage {
66
+ pname = "paperless-ghostnode";
67
+ version = "0.1.0";
68
+ src = pkgs.fetchPypi {
69
+ pname = "paperless-ghostnode";
70
+ version = "0.1.0";
71
+ sha256 = "sha256-...";
72
+ };
73
+ format = "pyproject";
74
+ nativeBuildInputs = [ pkgs.python3Packages.hatchling ];
75
+ };
76
+ in {
77
+ systemd.services.paperless-task-queue.environment = {
78
+ PAPERLESS_APPS = "paperless_ghostnode";
79
+ PYTHONPATH = "${paperless-ghostnode}/${pkgs.python3.sitePackages}";
80
+ PAPERLESS_GHOSTNODE_HOSTS = "192.168.x.x:8080";
81
+ };
82
+ }
83
+ ```
84
+
85
+ ## Configuration
86
+
87
+ - `PAPERLESS_APPS=paperless_ghostnode`: The plugin needs to be registered as an app
88
+ - `PAPERLESS_GHOSTNODE_HOSTS`: comma-separated list of `host:port` entries, tried in order. Should point to addresses which may host an active GhostNode app
89
+ - `PAPERLESS_GHOSTNODE_MODE`: `skip` (default), `force`, or `all`
90
+
91
+ ## Why a parser and not a pre-consume script
92
+
93
+ Paperless supports [pre-consumption scripts](https://docs.paperless-ngx.com/advanced_usage/#pre-consume-script) that run before a document is processed.
94
+
95
+ Instead of the paperless-ghostnode plugin the pre-consumption-script can be used to OCR documents with GhostNode.
96
+ The main problem with that is that the pre-consumption-script is not considered "processing" by paperless.
97
+
98
+ I have not tested what happens when the "reprocess" of a document is manually triggered from within paperless while the pre-consumption-script is in use. I would guess that either the pre-consumption-script is not rerun, which would defeat the purpose of "reprocessing", or it is rerun, which would mean that the script runs on an already processed PDF, because the original is lost when a pre-consumption-script replaces the file.
99
+
100
+ If you still want to use it despite these drawbacks, you can find the script below.
101
+
102
+ ### Pre-Consumption-Script
103
+
104
+ ```bash
105
+ #!/usr/bin/env bash
106
+ set -euo pipefail
107
+
108
+ GHOSTNODE_URL="${GHOSTNODE_URL:-http://<ghostnode-ip>:8080}"
109
+
110
+ case "$DOCUMENT_WORKING_PATH" in
111
+ *.pdf | *.PDF) ;;
112
+ *)
113
+ echo "ghostnode-ocr: not a PDF, skipping"
114
+ exit 0
115
+ ;;
116
+ esac
117
+
118
+ tmp=$(mktemp /tmp/ghostnode-ocr-XXXXXX.pdf)
119
+ trap 'rm -f "$tmp"' EXIT
120
+
121
+ echo "ghostnode-ocr: sending to GhostNode…"
122
+ if ! curl \
123
+ --silent \
124
+ --fail \
125
+ --output "$tmp" \
126
+ --max-time 300 \
127
+ --request POST \
128
+ --header "Content-Type: application/pdf" \
129
+ --data-binary "@$DOCUMENT_WORKING_PATH" \
130
+ "$GHOSTNODE_URL/api/v1/ocr?mode=skip"; then
131
+ echo "ghostnode-ocr: GhostNode unreachable or returned error"
132
+ exit 1
133
+ fi
134
+
135
+ cp "$tmp" "$DOCUMENT_WORKING_PATH"
136
+ echo "ghostnode-ocr: done"
137
+ ```
138
+
139
+ Point Paperless at it via `PAPERLESS_PRE_CONSUME_SCRIPT` and set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` so Paperless doesn't re-OCR the result with Tesseract afterwards.
@@ -0,0 +1,130 @@
1
+ # paperless-ghostnode
2
+
3
+ paperless-ghostnode is a paperless plugin for routing OCR through GhostNode instead of Tesseract.
4
+ The plugin registers a consumer which does a simple HTTP request to a number of GhostNodes.
5
+ Should all GhostNodes fail, the processing fails.
6
+
7
+ A failed processing is not too bad. It keeps the failed files in the consume-folder where they can be "retried" (unfortunately, there is no retry button, but a restart of paperless re-scans the folder and "re-runs" them).
8
+
9
+ ## Installation
10
+
11
+ Depending on your setup, choose one of the installation methods below.
12
+
13
+ ### pip
14
+
15
+ plain `pip install` of paperless-ghostnode
16
+
17
+ ```
18
+ pip install paperless-ghostnode
19
+ ```
20
+
21
+ ### Docker Compose
22
+
23
+ When using docker compose, we need to somehow give paperless access to the files of paperless-ghostnode.
24
+ One of the easiest ways is to use an init-service that installs the plugin into a shared volume via pip, and then to mount that volume into the paperless-worker.
25
+
26
+ ```yaml
27
+ services:
28
+ paperless-ghostnode-install:
29
+ image: python:slim
30
+ command: ["pip", "install", "--target", "/plugins", "paperless-ghostnode"]
31
+ volumes:
32
+ - paperless-ghostnode:/plugins
33
+
34
+ paperless-worker:
35
+ # ...
36
+ depends_on:
37
+ paperless-ghostnode-install:
38
+ condition: service_completed_successfully
39
+ volumes:
40
+ - paperless-ghostnode:/plugins
41
+ environment:
42
+ PYTHONPATH: /plugins
43
+ PAPERLESS_APPS: paperless_ghostnode
44
+ PAPERLESS_GHOSTNODE_HOSTS: "192.168.x.x:8080"
45
+
46
+ volumes:
47
+ paperless-ghostnode: {}
48
+ ```
49
+
50
+ ### Nix
51
+
52
+ When using nix, we can just load the plugin from PyPI, build it, and then directly attach it to the `PYTHONPATH` of the `paperless-task-queue` service.
53
+
54
+ ```nix
55
+ let
56
+ paperless-ghostnode = pkgs.python3Packages.buildPythonPackage {
57
+ pname = "paperless-ghostnode";
58
+ version = "0.1.0";
59
+ src = pkgs.fetchPypi {
60
+ pname = "paperless-ghostnode";
61
+ version = "0.1.0";
62
+ sha256 = "sha256-...";
63
+ };
64
+ format = "pyproject";
65
+ nativeBuildInputs = [ pkgs.python3Packages.hatchling ];
66
+ };
67
+ in {
68
+ systemd.services.paperless-task-queue.environment = {
69
+ PAPERLESS_APPS = "paperless_ghostnode";
70
+ PYTHONPATH = "${paperless-ghostnode}/${pkgs.python3.sitePackages}";
71
+ PAPERLESS_GHOSTNODE_HOSTS = "192.168.x.x:8080";
72
+ };
73
+ }
74
+ ```
75
+
76
+ ## Configuration
77
+
78
+ - `PAPERLESS_APPS=paperless_ghostnode`: The plugin needs to be registered as an app
79
+ - `PAPERLESS_GHOSTNODE_HOSTS`: comma-separated list of `host:port` entries, tried in order. Should point to addresses which may host an active GhostNode app
80
+ - `PAPERLESS_GHOSTNODE_MODE`: `skip` (default), `force`, or `all`
81
+
82
+ ## Why a parser and not a pre-consume script
83
+
84
+ Paperless supports [pre-consumption scripts](https://docs.paperless-ngx.com/advanced_usage/#pre-consume-script) that run before a document is processed.
85
+
86
+ Instead of the paperless-ghostnode plugin the pre-consumption-script can be used to OCR documents with GhostNode.
87
+ The main problem with that is that the pre-consumption-script is not considered "processing" by paperless.
88
+
89
+ I have not tested what happens when the "reprocess" of a document is manually triggered from within paperless while the pre-consumption-script is in use. I would guess that either the pre-consumption-script is not rerun, which would defeat the purpose of "reprocessing", or it is rerun, which would mean that the script runs on an already processed PDF, because the original is lost when a pre-consumption-script replaces the file.
90
+
91
+ If you still want to use it despite these drawbacks, you can find the script below.
92
+
93
+ ### Pre-Consumption-Script
94
+
95
+ ```bash
96
+ #!/usr/bin/env bash
97
+ set -euo pipefail
98
+
99
+ GHOSTNODE_URL="${GHOSTNODE_URL:-http://<ghostnode-ip>:8080}"
100
+
101
+ case "$DOCUMENT_WORKING_PATH" in
102
+ *.pdf | *.PDF) ;;
103
+ *)
104
+ echo "ghostnode-ocr: not a PDF, skipping"
105
+ exit 0
106
+ ;;
107
+ esac
108
+
109
+ tmp=$(mktemp /tmp/ghostnode-ocr-XXXXXX.pdf)
110
+ trap 'rm -f "$tmp"' EXIT
111
+
112
+ echo "ghostnode-ocr: sending to GhostNode…"
113
+ if ! curl \
114
+ --silent \
115
+ --fail \
116
+ --output "$tmp" \
117
+ --max-time 300 \
118
+ --request POST \
119
+ --header "Content-Type: application/pdf" \
120
+ --data-binary "@$DOCUMENT_WORKING_PATH" \
121
+ "$GHOSTNODE_URL/api/v1/ocr?mode=skip"; then
122
+ echo "ghostnode-ocr: GhostNode unreachable or returned error"
123
+ exit 1
124
+ fi
125
+
126
+ cp "$tmp" "$DOCUMENT_WORKING_PATH"
127
+ echo "ghostnode-ocr: done"
128
+ ```
129
+
130
+ Point Paperless at it via `PAPERLESS_PRE_CONSUME_SCRIPT` and set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` so Paperless doesn't re-OCR the result with Tesseract afterwards.
@@ -0,0 +1,14 @@
1
+ from django.apps import AppConfig
2
+
3
+
4
class GhostNodeConfig(AppConfig):
    """Django app configuration for the paperless-ghostnode plugin.

    When the app is ready, the GhostNode consumer declaration handler is
    connected to the ``document_consumer_declaration`` signal.
    """

    name = "paperless_ghostnode"

    def ready(self):
        """Connect the GhostNode declaration handler, then run the base hook."""
        # Imported inside ready() rather than at module level; presumably so
        # these modules are only loaded once Django's app registry is set up
        # -- TODO confirm.
        from documents.signals import (
            document_consumer_declaration as declaration_signal,
        )

        from paperless_ghostnode.signals import (
            ghostnode_consumer_declaration as declare_ghostnode,
        )

        declaration_signal.connect(declare_ghostnode)

        super().ready()
@@ -0,0 +1,57 @@
1
+ import os
2
+ import urllib.parse
3
+ import urllib.request
4
+ from pathlib import Path
5
+
6
+ from documents.parsers import ParseError
7
+ from paperless_tesseract.parsers import RasterisedDocumentParser
8
+
9
+ DEFAULT_MODE = "skip"
10
+
11
+
12
class GhostNodeDocumentParser(RasterisedDocumentParser):
    """Routes OCR through GhostNode instead of Tesseract."""

    logging_name = "paperless.parsing.ghostnode"

    def parse(self, document_path, mime_type, file_name=None):
        """Send the document to GhostNode and store the returned archive.

        Hosts are read from ``PAPERLESS_GHOSTNODE_HOSTS`` (comma-separated
        ``host:port`` entries) and tried in order. The first host that
        answers with a non-empty body wins: the body is written to
        ``archive.pdf`` inside ``self.tempdir``, ``self.archive_path`` is
        pointed at it, and ``self.text`` is set from ``self.extract_text``.

        The OCR mode is read from ``PAPERLESS_GHOSTNODE_MODE``; an unset,
        empty, or whitespace-only value falls back to ``DEFAULT_MODE``.

        Args:
            document_path: Path of the document to send.
            mime_type: MIME type, sent as the request's ``Content-Type``.
            file_name: Not used by this implementation.

        Raises:
            ParseError: If no host is configured, or if every configured
                host failed. In the latter case the last host's error is
                attached as the exception's cause.
        """
        hosts = [
            host.strip()
            for host in os.environ.get("PAPERLESS_GHOSTNODE_HOSTS", "").split(",")
            if host.strip()
        ]
        if not hosts:
            raise ParseError(
                "PAPERLESS_GHOSTNODE_HOSTS is unset or empty, set it to a comma-separated list of host:port entries.",
            )
        # A variable that is set but blank (e.g. `PAPERLESS_GHOSTNODE_MODE=`)
        # must not be forwarded as an empty `mode=` query parameter.
        mode = os.environ.get("PAPERLESS_GHOSTNODE_MODE", "").strip() or DEFAULT_MODE

        archive_path = Path(self.tempdir) / "archive.pdf"
        data = Path(document_path).read_bytes()
        query = urllib.parse.urlencode({"mode": mode})

        last_error = None
        for host in hosts:
            url = f"http://{host}/api/v1/ocr?{query}"
            try:
                self.log.debug(f"Sending document to GhostNode at {host} (mode={mode})")
                request = urllib.request.Request(
                    url,
                    data=data,
                    headers={"Content-Type": mime_type},
                    method="POST",
                )
                # urlopen raises HTTPError for HTTP error statuses
                with urllib.request.urlopen(request, timeout=300) as response:
                    body = response.read()
                if not body:
                    # An empty body would be stored as a broken archive file;
                    # treat it as a failure so the next host gets a chance.
                    raise ValueError("GhostNode returned an empty response body")
                archive_path.write_bytes(body)
                self.archive_path = archive_path
                self.text = self.extract_text(None, archive_path)
                self.log.debug(f"GhostNode OCR done via {host}")
                return
            except Exception as e:
                # Deliberately broad: any failure on this host should fall
                # through to the next configured host.
                last_error = e
                self.log.warning(f"GhostNode host {host} failed: {e}")

        # Chain the last failure so its traceback is kept with the ParseError.
        raise ParseError(
            f"GhostNode OCR failed on all hosts ({', '.join(hosts)}): {last_error}",
        ) from last_error
@@ -0,0 +1,22 @@
1
def get_parser(*args, **kwargs):
    """Build a ``GhostNodeDocumentParser``, forwarding all arguments to it.

    The parser class is imported when this function is called, not when
    this module is imported.
    """
    from paperless_ghostnode.parsers import GhostNodeDocumentParser as parser_class

    return parser_class(*args, **kwargs)
5
+
6
+
7
def ghostnode_consumer_declaration(sender, **kwargs):
    """Return the GhostNode parser declaration for the consumer signal.

    The result is a dict holding the parser factory, its weight, and a
    mapping of handled MIME types to file extensions. ``sender`` and any
    extra keyword arguments are ignored.
    """
    supported_types = {
        "application/pdf": ".pdf",
        "image/jpeg": ".jpg",
        "image/png": ".png",
        "image/tiff": ".tif",
        "image/gif": ".gif",
        "image/bmp": ".bmp",
        "image/webp": ".webp",
        "image/heic": ".heic",
    }
    # weight 1 > Tesseract's 0, so GhostNode wins for these types
    declaration = {
        "parser": get_parser,
        "weight": 1,
        "mime_types": supported_types,
    }
    return declaration
@@ -0,0 +1,14 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "paperless-ghostnode"
7
+ version = "0.1.0"
8
+ description = "Paperless-ngx parser that routes OCR through GhostNode instead of Tesseract"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "Apache-2.0" }
12
+
13
+ [project.urls]
14
+ Homepage = "https://github.com/sevenautumns/GhostNode"