trustgraph-ocr 0.21.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trustgraph_ocr-0.21.11/PKG-INFO +21 -0
- trustgraph_ocr-0.21.11/README.md +1 -0
- trustgraph_ocr-0.21.11/scripts/pdf-ocr +6 -0
- trustgraph_ocr-0.21.11/setup.cfg +4 -0
- trustgraph_ocr-0.21.11/setup.py +47 -0
- trustgraph_ocr-0.21.11/trustgraph/decoding/ocr/__init__.py +3 -0
- trustgraph_ocr-0.21.11/trustgraph/decoding/ocr/__main__.py +7 -0
- trustgraph_ocr-0.21.11/trustgraph/decoding/ocr/pdf_decoder.py +83 -0
- trustgraph_ocr-0.21.11/trustgraph/ocr_version.py +1 -0
- trustgraph_ocr-0.21.11/trustgraph/tesseract_version.py +1 -0
- trustgraph_ocr-0.21.11/trustgraph_ocr.egg-info/PKG-INFO +21 -0
- trustgraph_ocr-0.21.11/trustgraph_ocr.egg-info/SOURCES.txt +13 -0
- trustgraph_ocr-0.21.11/trustgraph_ocr.egg-info/dependency_links.txt +1 -0
- trustgraph_ocr-0.21.11/trustgraph_ocr.egg-info/requires.txt +6 -0
- trustgraph_ocr-0.21.11/trustgraph_ocr.egg-info/top_level.txt +2 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: trustgraph-ocr
|
3
|
+
Version: 0.21.11
|
4
|
+
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
5
|
+
Home-page: https://github.com/trustgraph-ai/trustgraph
|
6
|
+
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.21.11.tar.gz
|
7
|
+
Author: trustgraph.ai
|
8
|
+
Author-email: security@trustgraph.ai
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
11
|
+
Classifier: Operating System :: OS Independent
|
12
|
+
Requires-Python: >=3.8
|
13
|
+
Description-Content-Type: text/markdown
|
14
|
+
Requires-Dist: trustgraph-base<0.22,>=0.21
|
15
|
+
Requires-Dist: pulsar-client
|
16
|
+
Requires-Dist: prometheus-client
|
17
|
+
Requires-Dist: boto3
|
18
|
+
Requires-Dist: pdf2image
|
19
|
+
Requires-Dist: pytesseract
|
20
|
+
|
21
|
+
See https://trustgraph.ai/
|
@@ -0,0 +1 @@
|
|
1
|
+
See https://trustgraph.ai/
|
@@ -0,0 +1,47 @@
|
|
1
|
+
"""Packaging script for the trustgraph-ocr distribution."""

# Import the submodule explicitly: a bare `import importlib` does not
# guarantee that `importlib.util` is available as an attribute.
import importlib.util
import os

import setuptools

# Read the long description as UTF-8 rather than relying on the locale's
# default encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Load a version number module directly from its file path
spec = importlib.util.spec_from_file_location(
    'version', 'trustgraph/ocr_version.py'
)
version_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(version_module)

version = version_module.__version__

setuptools.setup(
    name="trustgraph-ocr",
    version=version,
    author="trustgraph.ai",
    author_email="security@trustgraph.ai",
    description="TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/trustgraph-ai/trustgraph",
    packages=setuptools.find_namespace_packages(
        where='./',
    ),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.8',
    # The download URL is derived from the version loaded above.
    download_url="https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz",
    install_requires=[
        "trustgraph-base>=0.21,<0.22",
        "pulsar-client",
        "prometheus-client",
        "boto3",
        "pdf2image",
        "pytesseract",
    ],
    scripts=[
        "scripts/pdf-ocr",
    ]
)
|
@@ -0,0 +1,83 @@
|
|
1
|
+
|
2
|
+
"""
|
3
|
+
Simple decoder, accepts PDF documents on input, outputs pages from the
|
4
|
+
PDF document as text as separate output objects.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import tempfile
|
8
|
+
import base64
|
9
|
+
import pytesseract
|
10
|
+
from pdf2image import convert_from_bytes
|
11
|
+
|
12
|
+
from ... schema import Document, TextDocument, Metadata
|
13
|
+
from ... schema import document_ingest_queue, text_ingest_queue
|
14
|
+
from ... log_level import LogLevel
|
15
|
+
from ... base import ConsumerProducer
|
16
|
+
|
17
|
+
# Dotted module path with the first and last components of __name__
# dropped; used below as the default subscriber and passed to
# Processor.launch() by run().
_name_components = __name__.split(".")
module = ".".join(_name_components[1:-1])

# Defaults applied when the caller supplies no queue/subscriber settings.
default_subscriber = module
default_input_queue = document_ingest_queue
default_output_queue = text_ingest_queue
|
22
|
+
|
23
|
+
class Processor(ConsumerProducer):
    """Consume PDF `Document` messages and emit one `TextDocument` per page.

    Each incoming document is base64-decoded, converted to page images,
    and each page image is run through Tesseract OCR. The recognised text
    of every page is sent as a separate output message.
    """

    def __init__(self, **params):
        """Configure queues, subscriber and schemas, then init the base class.

        Args:
            **params: Processor settings. ``input_queue``, ``output_queue``
                and ``subscriber`` fall back to the module defaults when
                absent; every other entry is passed through to
                ``ConsumerProducer`` unchanged.
        """

        input_queue = params.get("input_queue", default_input_queue)
        output_queue = params.get("output_queue", default_output_queue)
        subscriber = params.get("subscriber", default_subscriber)

        # Merge with dict unpacking instead of the `|` operator: `dict | dict`
        # only exists on Python 3.9+, while the package declares support for
        # Python >=3.8. Later keys win, same as the union operator.
        super().__init__(
            **{
                **params,
                "input_queue": input_queue,
                "output_queue": output_queue,
                "subscriber": subscriber,
                "input_schema": Document,
                "output_schema": TextDocument,
            }
        )

        print("PDF OCR inited")

    async def handle(self, msg):
        """OCR every page of the PDF carried by ``msg`` and send the text.

        Args:
            msg: Incoming message; ``msg.value()`` must expose ``metadata``
                (with an ``id``) and base64-encoded PDF bytes in ``data``.

        Pages whose OCR raises are reported on stdout and skipped, so one
        bad page does not abort the rest of the document.
        """

        print("PDF message received")

        v = msg.value()

        print(f"Decoding {v.metadata.id}...", flush=True)

        blob = base64.b64decode(v.data)

        # Render the PDF to one image per page.
        pages = convert_from_bytes(blob)

        for page in pages:

            try:
                text = pytesseract.image_to_string(page, lang='eng')
            except Exception as e:
                # Best effort: skip the page that failed and carry on.
                print(f"Page did not OCR: {e}")
                continue

            # Every page reuses the source document's metadata.
            r = TextDocument(
                metadata=v.metadata,
                text=text.encode("utf-8"),
            )

            await self.send(r)

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """Register the base class arguments on ``parser`` with this module's defaults."""

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )
|
79
|
+
|
80
|
+
def run():
    """Entry point: launch the processor under this module's name."""

    # `__doc__` resolves to the module docstring here, not this function's.
    description = __doc__
    Processor.launch(module, description)
|
83
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
# Package release version. setup.py loads `__version__` from
# trustgraph/ocr_version.py; presumably this is that file, judging by the
# diff's file order (TODO confirm).
__version__ = "0.21.11"
|
@@ -0,0 +1 @@
|
|
1
|
+
# NOTE(review): presumably trustgraph/tesseract_version.py, judging by the
# diff's file order. setup.py reads ocr_version.py, not this module, and
# "0.0.0" looks like a placeholder; confirm whether this file is still needed.
__version__ = "0.0.0"
|
@@ -0,0 +1,21 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: trustgraph-ocr
|
3
|
+
Version: 0.21.11
|
4
|
+
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
5
|
+
Home-page: https://github.com/trustgraph-ai/trustgraph
|
6
|
+
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.21.11.tar.gz
|
7
|
+
Author: trustgraph.ai
|
8
|
+
Author-email: security@trustgraph.ai
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
11
|
+
Classifier: Operating System :: OS Independent
|
12
|
+
Requires-Python: >=3.8
|
13
|
+
Description-Content-Type: text/markdown
|
14
|
+
Requires-Dist: trustgraph-base<0.22,>=0.21
|
15
|
+
Requires-Dist: pulsar-client
|
16
|
+
Requires-Dist: prometheus-client
|
17
|
+
Requires-Dist: boto3
|
18
|
+
Requires-Dist: pdf2image
|
19
|
+
Requires-Dist: pytesseract
|
20
|
+
|
21
|
+
See https://trustgraph.ai/
|
@@ -0,0 +1,13 @@
|
|
1
|
+
README.md
|
2
|
+
setup.py
|
3
|
+
scripts/pdf-ocr
|
4
|
+
trustgraph/ocr_version.py
|
5
|
+
trustgraph/tesseract_version.py
|
6
|
+
trustgraph/decoding/ocr/__init__.py
|
7
|
+
trustgraph/decoding/ocr/__main__.py
|
8
|
+
trustgraph/decoding/ocr/pdf_decoder.py
|
9
|
+
trustgraph_ocr.egg-info/PKG-INFO
|
10
|
+
trustgraph_ocr.egg-info/SOURCES.txt
|
11
|
+
trustgraph_ocr.egg-info/dependency_links.txt
|
12
|
+
trustgraph_ocr.egg-info/requires.txt
|
13
|
+
trustgraph_ocr.egg-info/top_level.txt
|
@@ -0,0 +1 @@
|
|
1
|
+
|