trustgraph-ocr 0.21.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.1
2
+ Name: trustgraph-ocr
3
+ Version: 0.21.11
4
+ Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
+ Home-page: https://github.com/trustgraph-ai/trustgraph
6
+ Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.21.11.tar.gz
7
+ Author: trustgraph.ai
8
+ Author-email: security@trustgraph.ai
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: trustgraph-base<0.22,>=0.21
15
+ Requires-Dist: pulsar-client
16
+ Requires-Dist: prometheus-client
17
+ Requires-Dist: boto3
18
+ Requires-Dist: pdf2image
19
+ Requires-Dist: pytesseract
20
+
21
+ See https://trustgraph.ai/
@@ -0,0 +1 @@
1
+ See https://trustgraph.ai/
@@ -0,0 +1,6 @@
1
#!/usr/bin/env python3
"""Command-line launcher: hands control to the OCR decoder's ``run``."""

from trustgraph.decoding import ocr

# The script has no logic of its own; everything happens inside run().
ocr.run()
6
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,47 @@
1
"""Packaging script for the trustgraph-ocr distribution.

Reads the long description from README.md and the version string from
trustgraph/ocr_version.py, then calls setuptools.setup().  Both paths are
relative, so the script must be run from the directory containing it.
"""

# importlib.util is a submodule: a bare "import importlib" does not
# guarantee that the "util" attribute exists, so import it explicitly.
import importlib.util
import os

import setuptools

# Decode the README as UTF-8 explicitly so the build does not depend on the
# locale's default encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Load a version number module directly from its file, without importing
# the trustgraph package itself.
spec = importlib.util.spec_from_file_location(
    'version', 'trustgraph/ocr_version.py'
)
version_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(version_module)

version = version_module.__version__

setuptools.setup(
    name="trustgraph-ocr",
    version=version,
    author="trustgraph.ai",
    author_email="security@trustgraph.ai",
    description="TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/trustgraph-ai/trustgraph",
    packages=setuptools.find_namespace_packages(
        where='./',
    ),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.8',
    # The download URL points at the source tag matching this version.
    download_url="https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz",
    install_requires=[
        "trustgraph-base>=0.21,<0.22",
        "pulsar-client",
        "prometheus-client",
        "boto3",
        "pdf2image",
        "pytesseract",
    ],
    scripts=[
        "scripts/pdf-ocr",
    ]
)
@@ -0,0 +1,3 @@
1
+
2
+ from . pdf_decoder import *
3
+
@@ -0,0 +1,7 @@
1
#!/usr/bin/env python3
"""Entry point used when this package is executed as the main module."""

from . import pdf_decoder

# Only start the processor when executed, not when merely imported.
if __name__ == "__main__":
    pdf_decoder.run()
7
+
@@ -0,0 +1,83 @@
1
+
2
+ """
3
+ Simple decoder, accepts PDF documents on input, outputs pages from the
4
+ PDF document as text as separate output objects.
5
+ """
6
+
7
+ import tempfile
8
+ import base64
9
+ import pytesseract
10
+ from pdf2image import convert_from_bytes
11
+
12
+ from ... schema import Document, TextDocument, Metadata
13
+ from ... schema import document_ingest_queue, text_ingest_queue
14
+ from ... log_level import LogLevel
15
+ from ... base import ConsumerProducer
16
+
17
# Dotted module path with the first and the last component of __name__
# removed (empty when __name__ has fewer than three components).
module = __name__.partition(".")[2].rpartition(".")[0]

# Defaults used by Processor when the caller does not override them.
default_subscriber = module
default_input_queue = document_ingest_queue
default_output_queue = text_ingest_queue
22
+
23
class Processor(ConsumerProducer):
    """Consume ``Document`` messages holding PDFs and emit OCR text.

    Each incoming message carries a base64-encoded PDF.  Every page is
    rendered to an image with pdf2image, run through pytesseract, and sent
    on as its own ``TextDocument`` carrying the source document's metadata.
    """

    def __init__(self, **params):
        """Set up the consumer/producer with queue and schema settings.

        ``input_queue``, ``output_queue`` and ``subscriber`` are taken from
        ``params`` when present and otherwise fall back to the module-level
        defaults.  The input and output schemas are always forced to
        ``Document`` and ``TextDocument``.
        """

        # Later keys win, so the entries below override anything in params.
        settings = {
            **params,
            "input_queue": params.get("input_queue", default_input_queue),
            "output_queue": params.get("output_queue", default_output_queue),
            "subscriber": params.get("subscriber", default_subscriber),
            "input_schema": Document,
            "output_schema": TextDocument,
        }

        super().__init__(**settings)

        print("PDF OCR inited")

    async def handle(self, msg):
        """OCR one PDF message and send one ``TextDocument`` per page.

        Pages for which OCR raises are reported on stdout and skipped; the
        remaining pages are still processed.
        """

        print("PDF message received")

        doc = msg.value()

        print(f"Decoding {doc.metadata.id}...", flush=True)

        # The payload is base64 text; pdf2image needs the raw PDF bytes.
        images = convert_from_bytes(base64.b64decode(doc.data))

        for image in images:

            try:
                recognised = pytesseract.image_to_string(image, lang='eng')
            except Exception as e:
                # Best effort: one bad page must not abort the document.
                print(f"Page did not OCR: {e}")
                continue

            await self.send(
                TextDocument(
                    metadata=doc.metadata,
                    text=recognised.encode("utf-8"),
                )
            )

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """Register the base class's arguments using this module's defaults."""

        ConsumerProducer.add_args(
            parser,
            default_input_queue,
            default_subscriber,
            default_output_queue,
        )
79
+
80
def run():
    """Launch ``Processor`` with this module's name and module docstring."""

    # __doc__ here resolves to the module-level docstring, not this one.
    launch = Processor.launch
    launch(module, __doc__)
83
+
@@ -0,0 +1 @@
1
+ __version__ = "0.21.11"
@@ -0,0 +1 @@
1
+ __version__ = "0.0.0"
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.1
2
+ Name: trustgraph-ocr
3
+ Version: 0.21.11
4
+ Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
+ Home-page: https://github.com/trustgraph-ai/trustgraph
6
+ Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.21.11.tar.gz
7
+ Author: trustgraph.ai
8
+ Author-email: security@trustgraph.ai
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: trustgraph-base<0.22,>=0.21
15
+ Requires-Dist: pulsar-client
16
+ Requires-Dist: prometheus-client
17
+ Requires-Dist: boto3
18
+ Requires-Dist: pdf2image
19
+ Requires-Dist: pytesseract
20
+
21
+ See https://trustgraph.ai/
@@ -0,0 +1,13 @@
1
+ README.md
2
+ setup.py
3
+ scripts/pdf-ocr
4
+ trustgraph/ocr_version.py
5
+ trustgraph/tesseract_version.py
6
+ trustgraph/decoding/ocr/__init__.py
7
+ trustgraph/decoding/ocr/__main__.py
8
+ trustgraph/decoding/ocr/pdf_decoder.py
9
+ trustgraph_ocr.egg-info/PKG-INFO
10
+ trustgraph_ocr.egg-info/SOURCES.txt
11
+ trustgraph_ocr.egg-info/dependency_links.txt
12
+ trustgraph_ocr.egg-info/requires.txt
13
+ trustgraph_ocr.egg-info/top_level.txt
@@ -0,0 +1,6 @@
1
+ trustgraph-base<0.22,>=0.21
2
+ pulsar-client
3
+ prometheus-client
4
+ boto3
5
+ pdf2image
6
+ pytesseract
@@ -0,0 +1,2 @@
1
+ scripts
2
+ trustgraph