trustgraph-ocr 1.2.0__tar.gz → 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/PKG-INFO +10 -7
- trustgraph_ocr-1.2.4/pyproject.toml +35 -0
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/trustgraph/decoding/ocr/pdf_decoder.py +9 -5
- trustgraph_ocr-1.2.4/trustgraph/ocr_version.py +1 -0
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/trustgraph_ocr.egg-info/PKG-INFO +10 -7
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/trustgraph_ocr.egg-info/SOURCES.txt +2 -2
- trustgraph_ocr-1.2.4/trustgraph_ocr.egg-info/entry_points.txt +2 -0
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/trustgraph_ocr.egg-info/top_level.txt +0 -1
- trustgraph-ocr-1.2.0/scripts/pdf-ocr +0 -6
- trustgraph-ocr-1.2.0/setup.py +0 -47
- trustgraph-ocr-1.2.0/trustgraph/ocr_version.py +0 -1
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/README.md +0 -0
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/setup.cfg +0 -0
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/trustgraph/decoding/ocr/__init__.py +0 -0
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/trustgraph/decoding/ocr/__main__.py +0 -0
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/trustgraph_ocr.egg-info/dependency_links.txt +0 -0
- {trustgraph-ocr-1.2.0 → trustgraph_ocr-1.2.4}/trustgraph_ocr.egg-info/requires.txt +3 -3
@@ -1,15 +1,18 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: trustgraph-ocr
|
3
|
-
Version: 1.2.0
|
3
|
+
Version: 1.2.4
|
4
4
|
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
5
|
-
|
6
|
-
|
7
|
-
Author: trustgraph.ai
|
8
|
-
Author-email: security@trustgraph.ai
|
5
|
+
Author-email: "trustgraph.ai" <security@trustgraph.ai>
|
6
|
+
Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
|
9
7
|
Classifier: Programming Language :: Python :: 3
|
10
|
-
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
11
8
|
Classifier: Operating System :: OS Independent
|
12
9
|
Requires-Python: >=3.8
|
13
10
|
Description-Content-Type: text/markdown
|
11
|
+
Requires-Dist: trustgraph-base<1.3,>=1.2
|
12
|
+
Requires-Dist: pulsar-client
|
13
|
+
Requires-Dist: prometheus-client
|
14
|
+
Requires-Dist: boto3
|
15
|
+
Requires-Dist: pdf2image
|
16
|
+
Requires-Dist: pytesseract
|
14
17
|
|
15
18
|
See https://trustgraph.ai/
|
@@ -0,0 +1,35 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
3
|
+
build-backend = "setuptools.build_meta"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "trustgraph-ocr"
|
7
|
+
dynamic = ["version"]
|
8
|
+
authors = [{name = "trustgraph.ai", email = "security@trustgraph.ai"}]
|
9
|
+
description = "TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline."
|
10
|
+
readme = "README.md"
|
11
|
+
requires-python = ">=3.8"
|
12
|
+
dependencies = [
|
13
|
+
"trustgraph-base>=1.2,<1.3",
|
14
|
+
"pulsar-client",
|
15
|
+
"prometheus-client",
|
16
|
+
"boto3",
|
17
|
+
"pdf2image",
|
18
|
+
"pytesseract",
|
19
|
+
]
|
20
|
+
classifiers = [
|
21
|
+
"Programming Language :: Python :: 3",
|
22
|
+
"Operating System :: OS Independent",
|
23
|
+
]
|
24
|
+
|
25
|
+
[project.urls]
|
26
|
+
Homepage = "https://github.com/trustgraph-ai/trustgraph"
|
27
|
+
|
28
|
+
[project.scripts]
|
29
|
+
pdf-ocr = "trustgraph.decoding.ocr:run"
|
30
|
+
|
31
|
+
[tool.setuptools.packages.find]
|
32
|
+
include = ["trustgraph*"]
|
33
|
+
|
34
|
+
[tool.setuptools.dynamic]
|
35
|
+
version = {attr = "trustgraph.ocr_version.__version__"}
|
@@ -6,12 +6,16 @@ PDF document as text as separate output objects.
|
|
6
6
|
|
7
7
|
import tempfile
|
8
8
|
import base64
|
9
|
+
import logging
|
9
10
|
import pytesseract
|
10
11
|
from pdf2image import convert_from_bytes
|
11
12
|
|
12
13
|
from ... schema import Document, TextDocument, Metadata
|
13
14
|
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
14
15
|
|
16
|
+
# Module logger
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
15
19
|
default_ident = "pdf-decoder"
|
16
20
|
|
17
21
|
class Processor(FlowProcessor):
|
@@ -41,15 +45,15 @@ class Processor(FlowProcessor):
|
|
41
45
|
)
|
42
46
|
)
|
43
47
|
|
44
|
-
|
48
|
+
logger.info("PDF OCR processor initialized")
|
45
49
|
|
46
50
|
async def on_message(self, msg, consumer, flow):
|
47
51
|
|
48
|
-
|
52
|
+
logger.info("PDF message received")
|
49
53
|
|
50
54
|
v = msg.value()
|
51
55
|
|
52
|
-
|
56
|
+
logger.info(f"Decoding {v.metadata.id}...")
|
53
57
|
|
54
58
|
blob = base64.b64decode(v.data)
|
55
59
|
|
@@ -60,7 +64,7 @@ class Processor(FlowProcessor):
|
|
60
64
|
try:
|
61
65
|
text = pytesseract.image_to_string(page, lang='eng')
|
62
66
|
except Exception as e:
|
63
|
-
|
67
|
+
logger.warning(f"Page did not OCR: {e}")
|
64
68
|
continue
|
65
69
|
|
66
70
|
r = TextDocument(
|
@@ -70,7 +74,7 @@ class Processor(FlowProcessor):
|
|
70
74
|
|
71
75
|
await flow("output").send(r)
|
72
76
|
|
73
|
-
|
77
|
+
logger.info("PDF decoding complete")
|
74
78
|
|
75
79
|
@staticmethod
|
76
80
|
def add_args(parser):
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "1.2.4"
|
@@ -1,15 +1,18 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: trustgraph-ocr
|
3
|
-
Version: 1.2.0
|
3
|
+
Version: 1.2.4
|
4
4
|
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
5
|
-
|
6
|
-
|
7
|
-
Author: trustgraph.ai
|
8
|
-
Author-email: security@trustgraph.ai
|
5
|
+
Author-email: "trustgraph.ai" <security@trustgraph.ai>
|
6
|
+
Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
|
9
7
|
Classifier: Programming Language :: Python :: 3
|
10
|
-
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
11
8
|
Classifier: Operating System :: OS Independent
|
12
9
|
Requires-Python: >=3.8
|
13
10
|
Description-Content-Type: text/markdown
|
11
|
+
Requires-Dist: trustgraph-base<1.3,>=1.2
|
12
|
+
Requires-Dist: pulsar-client
|
13
|
+
Requires-Dist: prometheus-client
|
14
|
+
Requires-Dist: boto3
|
15
|
+
Requires-Dist: pdf2image
|
16
|
+
Requires-Dist: pytesseract
|
14
17
|
|
15
18
|
See https://trustgraph.ai/
|
@@ -1,6 +1,5 @@
|
|
1
1
|
README.md
|
2
|
-
|
3
|
-
scripts/pdf-ocr
|
2
|
+
pyproject.toml
|
4
3
|
trustgraph/ocr_version.py
|
5
4
|
trustgraph/decoding/ocr/__init__.py
|
6
5
|
trustgraph/decoding/ocr/__main__.py
|
@@ -8,5 +7,6 @@ trustgraph/decoding/ocr/pdf_decoder.py
|
|
8
7
|
trustgraph_ocr.egg-info/PKG-INFO
|
9
8
|
trustgraph_ocr.egg-info/SOURCES.txt
|
10
9
|
trustgraph_ocr.egg-info/dependency_links.txt
|
10
|
+
trustgraph_ocr.egg-info/entry_points.txt
|
11
11
|
trustgraph_ocr.egg-info/requires.txt
|
12
12
|
trustgraph_ocr.egg-info/top_level.txt
|
trustgraph-ocr-1.2.0/setup.py
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
import setuptools
|
2
|
-
import os
|
3
|
-
import importlib
|
4
|
-
|
5
|
-
with open("README.md", "r") as fh:
|
6
|
-
long_description = fh.read()
|
7
|
-
|
8
|
-
# Load a version number module
|
9
|
-
spec = importlib.util.spec_from_file_location(
|
10
|
-
'version', 'trustgraph/ocr_version.py'
|
11
|
-
)
|
12
|
-
version_module = importlib.util.module_from_spec(spec)
|
13
|
-
spec.loader.exec_module(version_module)
|
14
|
-
|
15
|
-
version = version_module.__version__
|
16
|
-
|
17
|
-
setuptools.setup(
|
18
|
-
name="trustgraph-ocr",
|
19
|
-
version=version,
|
20
|
-
author="trustgraph.ai",
|
21
|
-
author_email="security@trustgraph.ai",
|
22
|
-
description="TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.",
|
23
|
-
long_description=long_description,
|
24
|
-
long_description_content_type="text/markdown",
|
25
|
-
url="https://github.com/trustgraph-ai/trustgraph",
|
26
|
-
packages=setuptools.find_namespace_packages(
|
27
|
-
where='./',
|
28
|
-
),
|
29
|
-
classifiers=[
|
30
|
-
"Programming Language :: Python :: 3",
|
31
|
-
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
|
32
|
-
"Operating System :: OS Independent",
|
33
|
-
],
|
34
|
-
python_requires='>=3.8',
|
35
|
-
download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz",
|
36
|
-
install_requires=[
|
37
|
-
"trustgraph-base>=1.2,<1.3",
|
38
|
-
"pulsar-client",
|
39
|
-
"prometheus-client",
|
40
|
-
"boto3",
|
41
|
-
"pdf2image",
|
42
|
-
"pytesseract",
|
43
|
-
],
|
44
|
-
scripts=[
|
45
|
-
"scripts/pdf-ocr",
|
46
|
-
]
|
47
|
-
)
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = "1.2.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|