trustgraph-ocr 1.2.3__tar.gz → 1.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: trustgraph-ocr
3
- Version: 1.2.3
3
+ Version: 1.2.5
4
4
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
5
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
6
6
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
@@ -6,12 +6,16 @@ PDF document as text as separate output objects.
6
6
 
7
7
  import tempfile
8
8
  import base64
9
+ import logging
9
10
  import pytesseract
10
11
  from pdf2image import convert_from_bytes
11
12
 
12
13
  from ... schema import Document, TextDocument, Metadata
13
14
  from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
14
15
 
16
+ # Module logger
17
+ logger = logging.getLogger(__name__)
18
+
15
19
  default_ident = "pdf-decoder"
16
20
 
17
21
  class Processor(FlowProcessor):
@@ -41,15 +45,15 @@ class Processor(FlowProcessor):
41
45
  )
42
46
  )
43
47
 
44
- print("PDF OCR inited")
48
+ logger.info("PDF OCR processor initialized")
45
49
 
46
50
  async def on_message(self, msg, consumer, flow):
47
51
 
48
- print("PDF message received", flush=True)
52
+ logger.info("PDF message received")
49
53
 
50
54
  v = msg.value()
51
55
 
52
- print(f"Decoding {v.metadata.id}...", flush=True)
56
+ logger.info(f"Decoding {v.metadata.id}...")
53
57
 
54
58
  blob = base64.b64decode(v.data)
55
59
 
@@ -60,7 +64,7 @@ class Processor(FlowProcessor):
60
64
  try:
61
65
  text = pytesseract.image_to_string(page, lang='eng')
62
66
  except Exception as e:
63
- print(f"Page did not OCR: {e}")
67
+ logger.warning(f"Page did not OCR: {e}")
64
68
  continue
65
69
 
66
70
  r = TextDocument(
@@ -70,7 +74,7 @@ class Processor(FlowProcessor):
70
74
 
71
75
  await flow("output").send(r)
72
76
 
73
- print("Done.", flush=True)
77
+ logger.info("PDF decoding complete")
74
78
 
75
79
  @staticmethod
76
80
  def add_args(parser):
@@ -0,0 +1 @@
1
+ __version__ = "1.2.5"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: trustgraph-ocr
3
- Version: 1.2.3
3
+ Version: 1.2.5
4
4
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
5
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
6
6
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
@@ -1 +0,0 @@
1
- __version__ = "1.2.3"
File without changes
File without changes