trustgraph-ocr 1.2.0__tar.gz → 1.2.4__tar.gz

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,18 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: trustgraph-ocr
3
- Version: 1.2.0
3
+ Version: 1.2.4
4
4
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
- Home-page: https://github.com/trustgraph-ai/trustgraph
6
- Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v1.2.0.tar.gz
7
- Author: trustgraph.ai
8
- Author-email: security@trustgraph.ai
5
+ Author-email: "trustgraph.ai" <security@trustgraph.ai>
6
+ Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
9
7
  Classifier: Programming Language :: Python :: 3
10
- Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
8
  Classifier: Operating System :: OS Independent
12
9
  Requires-Python: >=3.8
13
10
  Description-Content-Type: text/markdown
11
+ Requires-Dist: trustgraph-base<1.3,>=1.2
12
+ Requires-Dist: pulsar-client
13
+ Requires-Dist: prometheus-client
14
+ Requires-Dist: boto3
15
+ Requires-Dist: pdf2image
16
+ Requires-Dist: pytesseract
14
17
 
15
18
  See https://trustgraph.ai/
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "trustgraph-ocr"
7
+ dynamic = ["version"]
8
+ authors = [{name = "trustgraph.ai", email = "security@trustgraph.ai"}]
9
+ description = "TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline."
10
+ readme = "README.md"
11
+ requires-python = ">=3.8"
12
+ dependencies = [
13
+ "trustgraph-base>=1.2,<1.3",
14
+ "pulsar-client",
15
+ "prometheus-client",
16
+ "boto3",
17
+ "pdf2image",
18
+ "pytesseract",
19
+ ]
20
+ classifiers = [
21
+ "Programming Language :: Python :: 3",
22
+ "Operating System :: OS Independent",
23
+ ]
24
+
25
+ [project.urls]
26
+ Homepage = "https://github.com/trustgraph-ai/trustgraph"
27
+
28
+ [project.scripts]
29
+ pdf-ocr = "trustgraph.decoding.ocr:run"
30
+
31
+ [tool.setuptools.packages.find]
32
+ include = ["trustgraph*"]
33
+
34
+ [tool.setuptools.dynamic]
35
+ version = {attr = "trustgraph.ocr_version.__version__"}
@@ -6,12 +6,16 @@ PDF document as text as separate output objects.
6
6
 
7
7
  import tempfile
8
8
  import base64
9
+ import logging
9
10
  import pytesseract
10
11
  from pdf2image import convert_from_bytes
11
12
 
12
13
  from ... schema import Document, TextDocument, Metadata
13
14
  from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
14
15
 
16
+ # Module logger
17
+ logger = logging.getLogger(__name__)
18
+
15
19
  default_ident = "pdf-decoder"
16
20
 
17
21
  class Processor(FlowProcessor):
@@ -41,15 +45,15 @@ class Processor(FlowProcessor):
41
45
  )
42
46
  )
43
47
 
44
- print("PDF OCR inited")
48
+ logger.info("PDF OCR processor initialized")
45
49
 
46
50
  async def on_message(self, msg, consumer, flow):
47
51
 
48
- print("PDF message received", flush=True)
52
+ logger.info("PDF message received")
49
53
 
50
54
  v = msg.value()
51
55
 
52
- print(f"Decoding {v.metadata.id}...", flush=True)
56
+ logger.info(f"Decoding {v.metadata.id}...")
53
57
 
54
58
  blob = base64.b64decode(v.data)
55
59
 
@@ -60,7 +64,7 @@ class Processor(FlowProcessor):
60
64
  try:
61
65
  text = pytesseract.image_to_string(page, lang='eng')
62
66
  except Exception as e:
63
- print(f"Page did not OCR: {e}")
67
+ logger.warning(f"Page did not OCR: {e}")
64
68
  continue
65
69
 
66
70
  r = TextDocument(
@@ -70,7 +74,7 @@ class Processor(FlowProcessor):
70
74
 
71
75
  await flow("output").send(r)
72
76
 
73
- print("Done.", flush=True)
77
+ logger.info("PDF decoding complete")
74
78
 
75
79
  @staticmethod
76
80
  def add_args(parser):
@@ -0,0 +1 @@
1
+ __version__ = "1.2.4"
@@ -1,15 +1,18 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: trustgraph-ocr
3
- Version: 1.2.0
3
+ Version: 1.2.4
4
4
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
- Home-page: https://github.com/trustgraph-ai/trustgraph
6
- Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v1.2.0.tar.gz
7
- Author: trustgraph.ai
8
- Author-email: security@trustgraph.ai
5
+ Author-email: "trustgraph.ai" <security@trustgraph.ai>
6
+ Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
9
7
  Classifier: Programming Language :: Python :: 3
10
- Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
8
  Classifier: Operating System :: OS Independent
12
9
  Requires-Python: >=3.8
13
10
  Description-Content-Type: text/markdown
11
+ Requires-Dist: trustgraph-base<1.3,>=1.2
12
+ Requires-Dist: pulsar-client
13
+ Requires-Dist: prometheus-client
14
+ Requires-Dist: boto3
15
+ Requires-Dist: pdf2image
16
+ Requires-Dist: pytesseract
14
17
 
15
18
  See https://trustgraph.ai/
@@ -1,6 +1,5 @@
1
1
  README.md
2
- setup.py
3
- scripts/pdf-ocr
2
+ pyproject.toml
4
3
  trustgraph/ocr_version.py
5
4
  trustgraph/decoding/ocr/__init__.py
6
5
  trustgraph/decoding/ocr/__main__.py
@@ -8,5 +7,6 @@ trustgraph/decoding/ocr/pdf_decoder.py
8
7
  trustgraph_ocr.egg-info/PKG-INFO
9
8
  trustgraph_ocr.egg-info/SOURCES.txt
10
9
  trustgraph_ocr.egg-info/dependency_links.txt
10
+ trustgraph_ocr.egg-info/entry_points.txt
11
11
  trustgraph_ocr.egg-info/requires.txt
12
12
  trustgraph_ocr.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pdf-ocr = trustgraph.decoding.ocr:run
@@ -1,6 +0,0 @@
1
- #!/usr/bin/env python3
2
-
3
- from trustgraph.decoding.ocr import run
4
-
5
- run()
6
-
@@ -1,47 +0,0 @@
1
- import setuptools
2
- import os
3
- import importlib
4
-
5
- with open("README.md", "r") as fh:
6
- long_description = fh.read()
7
-
8
- # Load a version number module
9
- spec = importlib.util.spec_from_file_location(
10
- 'version', 'trustgraph/ocr_version.py'
11
- )
12
- version_module = importlib.util.module_from_spec(spec)
13
- spec.loader.exec_module(version_module)
14
-
15
- version = version_module.__version__
16
-
17
- setuptools.setup(
18
- name="trustgraph-ocr",
19
- version=version,
20
- author="trustgraph.ai",
21
- author_email="security@trustgraph.ai",
22
- description="TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.",
23
- long_description=long_description,
24
- long_description_content_type="text/markdown",
25
- url="https://github.com/trustgraph-ai/trustgraph",
26
- packages=setuptools.find_namespace_packages(
27
- where='./',
28
- ),
29
- classifiers=[
30
- "Programming Language :: Python :: 3",
31
- "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
32
- "Operating System :: OS Independent",
33
- ],
34
- python_requires='>=3.8',
35
- download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz",
36
- install_requires=[
37
- "trustgraph-base>=1.2,<1.3",
38
- "pulsar-client",
39
- "prometheus-client",
40
- "boto3",
41
- "pdf2image",
42
- "pytesseract",
43
- ],
44
- scripts=[
45
- "scripts/pdf-ocr",
46
- ]
47
- )
@@ -1 +0,0 @@
1
- __version__ = "1.2.0"
File without changes
File without changes
@@ -1,6 +1,6 @@
1
+ trustgraph-base<1.3,>=1.2
2
+ pulsar-client
3
+ prometheus-client
1
4
  boto3
2
5
  pdf2image
3
- prometheus-client
4
- pulsar-client
5
6
  pytesseract
6
- trustgraph-base<1.3,>=1.2