langchain-ocr-lib 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. langchain_ocr_lib-0.1.0/PKG-INFO +28 -0
  2. langchain_ocr_lib-0.1.0/README.md +0 -0
  3. langchain_ocr_lib-0.1.0/pyproject.toml +113 -0
  4. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/__init__.py +0 -0
  5. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/chains/__init__.py +0 -0
  6. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/chains/chain.py +55 -0
  7. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/converter/__init__.py +0 -0
  8. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/converter/converter.py +52 -0
  9. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/di_config.py +86 -0
  10. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/__init__.py +0 -0
  11. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/chains/__init__.py +0 -0
  12. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/chains/ocr_chain.py +86 -0
  13. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/converter/__init__.py +0 -0
  14. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/converter/image_converter.py +88 -0
  15. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/converter/pdf_converter.py +105 -0
  16. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/langfuse_manager/__init__.py +0 -0
  17. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/langfuse_manager/langfuse_manager.py +149 -0
  18. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/llms/__init__.py +0 -0
  19. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/llms/llm_factory.py +66 -0
  20. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/llms/llm_type.py +11 -0
  21. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/__init__.py +0 -0
  22. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/langfuse_settings.py +29 -0
  23. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/language_settings.py +25 -0
  24. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/llm_class_type_settings.py +27 -0
  25. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/ollama_chat_settings.py +42 -0
  26. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/openai_chat_settings.py +35 -0
  27. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/tracers/__init__.py +0 -0
  28. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/tracers/langfuse_traced_chain.py +44 -0
  29. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/language_mapping/language_mapping.py +19 -0
  30. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/main.py +122 -0
  31. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/prompt_templates/__init__.py +0 -0
  32. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/prompt_templates/ocr_prompt.py +60 -0
  33. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/tracers/__init__.py +0 -0
  34. langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/tracers/traced_chain.py +88 -0
@@ -0,0 +1,28 @@
1
+ Metadata-Version: 2.3
2
+ Name: langchain-ocr-lib
3
+ Version: 0.1.0
4
+ Summary:
5
+ License: MIT
6
+ Author: Andreas Klos
7
+ Author-email: aklos@outlook.de
8
+ Requires-Python: >=3.11,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Dist: deprecated (>=1.2.14,<2.0.0)
15
+ Requires-Dist: inject (>=5.2.1,<6.0.0)
16
+ Requires-Dist: langchain-community (>=0.3.19,<0.4.0)
17
+ Requires-Dist: langchain-ollama (>=0.2.0,<0.3.0)
18
+ Requires-Dist: langchain-openai (>=0.3.8,<0.4.0)
19
+ Requires-Dist: langfuse (>=2.59.7,<3.0.0)
20
+ Requires-Dist: openai (>=1.42.0,<2.0.0)
21
+ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
22
+ Requires-Dist: pillow (>=11.0.0,<12.0.0)
23
+ Requires-Dist: pycountry (>=24.6.1,<25.0.0)
24
+ Requires-Dist: pytest-asyncio (>=0.25.0,<0.26.0)
25
+ Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
26
+ Description-Content-Type: text/markdown
27
+
28
+
File without changes
@@ -0,0 +1,113 @@
1
+ [build-system]
2
+ requires = ["poetry-core"]
3
+ build-backend = "poetry.core.masonry.api"
4
+
5
+ [tool.poetry.scripts]
6
+ langchain-ocr = "langchain_ocr_lib.main:main"
7
+
8
+ [tool.poetry]
9
+ name = "langchain-ocr-lib"
10
+ version = "0.1.0"
11
+ description = ""
12
+ authors = ["Andreas Klos <aklos@outlook.de>"]
13
+ readme = "README.md"
14
+ packages = [{ include = "langchain_ocr_lib", from = "src" }]
15
+ license = "MIT"
16
+
17
+ [tool.poetry.dependencies]
18
+ python = "^3.11"
19
+ deprecated = "^1.2.14"
20
+ pyyaml = "^6.0.2"
21
+ openai = "^1.42.0"
22
+ pillow = "^11.0.0"
23
+ langchain-ollama = "^0.2.0"
24
+ pytest-asyncio = "^0.25.0"
25
+ langchain-community = "^0.3.19"
26
+ langchain-openai = "^0.3.8"
27
+ langfuse = "^2.59.7"
28
+ pycountry = "^24.6.1"
29
+ pdf2image = "^1.17.0"
30
+ inject = "^5.2.1"
31
+
32
+ [tool.poetry.group.dev.dependencies]
33
+ debugpy = "^1.8.1"
34
+ pytest = "^8.2.1"
35
+ coverage = "^7.5.4"
36
+ flake8 = "^7.1.0"
37
+ flake8-black = "^0.3.6"
38
+ flake8-pyproject = "^1.2.3"
39
+ flake8-quotes = "^3.4.0"
40
+ flake8-return = "^1.2.0"
41
+ flake8-annotations-complexity = "^0.0.8"
42
+ flake8-bandit = "^4.1.1"
43
+ flake8-bugbear = "^24.8.19"
44
+ flake8-builtins = "^2.5.0"
45
+ flake8-comprehensions = "^3.15.0"
46
+ flake8-eradicate = "^1.5.0"
47
+ flake8-expression-complexity = "^0.0.11"
48
+ flake8-pytest-style = "^2.0.0"
49
+ pep8-naming = "^0.14.1"
50
+ flake8-eol = "^0.0.8"
51
+ flake8-exceptions = "^0.0.1a0"
52
+ flake8-simplify = "^0.21.0"
53
+ flake8-wot = "^0.2.0"
54
+ flake8-function-order = "^0.0.5"
55
+ flake8-tidy-imports = "^4.10.0"
56
+ black = "^23.9.1"
57
+ # flake8-logging-format = "^2024.24.12"
58
+ flake8-docstrings = "^1.7.0"
59
+
60
+
61
+ [tool.flake8]
62
+ exclude= [".eggs", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist", "**/__init__.py", "src/langchain_ocr_lib/prompt_templates/ocr_prompt.py"]
63
+ statistics = true
64
+ show-source = false
65
+ max-complexity = 8
66
+ max-annotations-complexity = 3
67
+ docstring-convention = 'numpy'
68
+ max-line-length = 120
69
+ ignore = ["E203", "W503", "E704"]
70
+ inline-quotes = '"'
71
+ docstring-quotes = '"""'
72
+ multiline-quotes = '"""'
73
+ dictionaries = ["en_US", "python", "technical", "pandas"]
74
+ ban-relative-imports = true
75
+ per-file-ignores = """
76
+
77
+ """
78
+
79
+ [tool.black]
80
+ line-length = 120
81
+ exclude = """
82
+ /(
83
+ .eggs
84
+ | .git
85
+ | .hg
86
+ | .mypy_cache
87
+ | .nox
88
+ | .pants.d
89
+ | .tox
90
+ | .venv
91
+ | _build
92
+ | buck-out
93
+ | build
94
+ | dist
95
+ | node_modules
96
+ | venv
97
+ )/
98
+ """
99
+
100
+ [tool.isort]
101
+ profile = "black"
102
+ skip = ['.eggs', '.git', '.hg', '.mypy_cache', '.nox', '.pants.d', '.tox', '.venv', '_build', 'buck-out', 'build', 'dist', 'node_modules', 'venv']
103
+ skip_gitignore = true
104
+ known_local_folder = ["langchain_ocr_lib"]
105
+
106
+ [tool.pylint]
107
+ max-line-length = 120
108
+
109
+ [tool.pytest.ini_options]
110
+ log_cli = 1
111
+ log_cli_level = "DEBUG"
112
+ testpaths = "tests"
113
+ pythonpath = "src"
@@ -0,0 +1,55 @@
1
+ """Module for the base class of chains."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Optional
5
+
6
+ from langchain_core.runnables import Runnable, RunnableConfig
7
+ from langchain_core.runnables.utils import Input, Output
8
+
9
+
10
class Chain(Runnable[Input, Output], ABC):
    """Abstract base class for chains.

    Declares the two entry points a concrete chain has to implement:
    the asynchronous ``ainvoke`` and the synchronous ``invoke``.
    """

    @abstractmethod
    async def ainvoke(self, chain_input: Input, config: Optional[RunnableConfig] = None, **kwargs: Any) -> Output:
        """Asynchronously invoke the chain with the given input and configuration.

        Parameters
        ----------
        chain_input : Input
            The input data required to asynchronously invoke the chain.
        config : Optional[RunnableConfig], optional
            The configuration settings for the chain invocation, by default None.
        **kwargs : Any
            Additional keyword arguments that may be required for the chain invocation.

        Returns
        -------
        Output
            The result of the chain invocation.
        """

    @abstractmethod
    def invoke(self, chain_input: Input, config: Optional[RunnableConfig] = None, **kwargs: Any) -> Output:
        """Invoke the chain synchronously with the given input and configuration.

        Notes
        -----
        This method is abstract: the base class provides no behaviour and raises nothing itself.
        Concrete chains must override it together with ``ainvoke``.

        Parameters
        ----------
        chain_input : Input
            The input data required to invoke the chain.
        config : Optional[RunnableConfig], optional
            The configuration settings for the chain invocation, by default None.
        **kwargs : Any
            Additional keyword arguments that may be required for the chain invocation.

        Returns
        -------
        Output
            The result of the chain invocation.
        """
@@ -0,0 +1,52 @@
1
+ """Module for the File2MarkdownConverter class."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ import inject
5
+
6
+
7
class File2MarkdownConverter(ABC):
    """Abstract base class for converters that turn a file into markdown."""

    # Chain used by the concrete converters; resolved by `inject` under the key "LangfuseTracedChain".
    _chain = inject.attr("LangfuseTracedChain")

    @abstractmethod
    async def aconvert2markdown(self, file: bytes | None = None, filename: str | None = None) -> str:
        """Asynchronously convert a file to markdown format.

        The signature mirrors the concrete converters, which accept either the
        file content or a path to it.

        Parameters
        ----------
        file : bytes | None, optional
            The file content to convert, by default None.
            Concrete converters may accept a richer object than raw bytes.
        filename : str | None, optional
            Path of the file to convert, by default None.

        Returns
        -------
        str
            The markdown representation of the file.

        Raises
        ------
        NotImplementedError
            If the method is not implemented.
        """
        raise NotImplementedError

    @abstractmethod
    def convert2markdown(self, file: bytes | None = None, filename: str | None = None) -> str:
        """Convert a file to markdown format.

        The signature mirrors the concrete converters, which accept either the
        file content or a path to it.

        Parameters
        ----------
        file : bytes | None, optional
            The file content to convert, by default None.
            Concrete converters may accept a richer object than raw bytes.
        filename : str | None, optional
            Path of the file to convert, by default None.

        Returns
        -------
        str
            The markdown representation of the file.

        Raises
        ------
        NotImplementedError
            If the method is not implemented.
        """
        raise NotImplementedError
@@ -0,0 +1,86 @@
1
+ """Module containing the dependency injection container for managing application dependencies."""
2
+
3
+ from inject import Binder
4
+ import inject
5
+ from langchain_ollama import ChatOllama
6
+ from langchain_openai import ChatOpenAI
7
+ from langfuse import Langfuse
8
+
9
+ from langchain_ocr_lib.impl.chains.ocr_chain import OcrChain
10
+ from langchain_ocr_lib.impl.settings.ollama_chat_settings import OllamaSettings
11
+ from langchain_ocr_lib.impl.settings.openai_chat_settings import OpenAISettings
12
+ from langchain_ocr_lib.impl.settings.llm_class_type_settings import LlmClassTypeSettings
13
+ from langchain_ocr_lib.impl.settings.langfuse_settings import LangfuseSettings
14
+ from langchain_ocr_lib.impl.settings.language_settings import LanguageSettings
15
+ from langchain_ocr_lib.impl.tracers.langfuse_traced_chain import LangfuseTracedChain
16
+ from langchain_ocr_lib.prompt_templates.ocr_prompt import ocr_prompt_template_builder
17
+ from langchain_ocr_lib.impl.llms.llm_factory import llm_provider
18
+ from langchain_ocr_lib.impl.langfuse_manager.langfuse_manager import LangfuseManager
19
+ from langchain_ocr_lib.impl.converter.pdf_converter import Pdf2MarkdownConverter
20
+ from langchain_ocr_lib.impl.converter.image_converter import Image2MarkdownConverter
21
+
22
+
23
+ def lib_di_config(binder: Binder):
24
+ """Configure dependency injection bindings for the OCR library.
25
+
26
+ Parameters
27
+ ----------
28
+ binder : Binder
29
+ The dependency injection binder instance used to register the bindings.
30
+
31
+ Raises
32
+ ------
33
+ NotImplementedError
34
+ If the configured LLM type is not implemented.
35
+
36
+ """
37
+ langfuse_settings = LangfuseSettings()
38
+ llm_class_type_settings = LlmClassTypeSettings()
39
+ language_settings = LanguageSettings()
40
+
41
+ if llm_class_type_settings.llm_type == "ollama":
42
+ settings = OllamaSettings()
43
+ llm_instance = llm_provider(settings, ChatOllama)
44
+ elif llm_class_type_settings.llm_type == "openai":
45
+ settings = OpenAISettings()
46
+ llm_instance = llm_provider(settings, ChatOpenAI)
47
+ else:
48
+ raise NotImplementedError("Configured LLM is not implemented")
49
+ binder.bind("LargeLanguageModel", llm_instance)
50
+
51
+ prompt = ocr_prompt_template_builder(language=language_settings.language, model_name=settings.model)
52
+
53
+ binder.bind(
54
+ "LangfuseClient",
55
+ Langfuse(
56
+ public_key=langfuse_settings.public_key,
57
+ secret_key=langfuse_settings.secret_key,
58
+ host=langfuse_settings.host,
59
+ ),
60
+ )
61
+
62
+ binder.bind(
63
+ "LangfuseManager",
64
+ LangfuseManager(
65
+ managed_prompts={
66
+ OcrChain.__name__: prompt,
67
+ },
68
+ ),
69
+ )
70
+
71
+ binder.bind("OcrChain", OcrChain())
72
+
73
+ binder.bind(
74
+ "LangfuseTracedChain",
75
+ LangfuseTracedChain(
76
+ settings=langfuse_settings,
77
+ ),
78
+ )
79
+
80
+ binder.bind("PdfConverter", Pdf2MarkdownConverter())
81
+ binder.bind("ImageConverter", Image2MarkdownConverter())
82
+
83
+
84
def configure_di():
    """Install `lib_di_config` as the `inject` configuration.

    The configuration is applied with `allow_override=True` and `clear=True`.
    """
    configure_options = {"allow_override": True, "clear": True}
    inject.configure(lib_di_config, **configure_options)
@@ -0,0 +1,86 @@
1
+ """Module for LLM answer generation chain."""
2
+
3
+ from typing import Any, Optional
4
+
5
+ from langchain_core.runnables import Runnable, RunnableConfig
6
+ from langchain_core.runnables.utils import Input
7
+ import inject
8
+
9
+ from langchain_ocr_lib.chains.chain import Chain
10
+
11
RunnableInput = Input  # TODO: adjust properly
# NOTE(review): the converters in this package read `.content` from the chain result,
# so the effective output looks like a chat message rather than `str` - confirm.
RunnableOutput = str


class OcrChain(Chain[RunnableInput, RunnableOutput]):
    """OCR chain that pipes a managed prompt into a managed LLM.

    Both parts are obtained from the injected Langfuse manager and are looked up
    under the name of this class.
    """

    _langfuse_manager = inject.attr("LangfuseManager")

    def __init__(self):
        """Initialize the OcrChain.

        The constructor takes no arguments: the Langfuse manager is not passed in,
        it is resolved through the injected ``_langfuse_manager`` class attribute.
        """

    async def ainvoke(
        self, chain_input: RunnableInput, config: Optional[RunnableConfig] = None, **kwargs: Any
    ) -> RunnableOutput:
        """
        Asynchronously invokes the chain with given input.

        Parameters
        ----------
        chain_input : RunnableInput
            The input to be processed by the chain.
        config : Optional[RunnableConfig]
            Configuration for the chain execution (default None).
        **kwargs : Any
            Additional keyword arguments forwarded to the underlying runnable.

        Returns
        -------
        RunnableOutput
            The output generated by the chain.

        Notes
        -----
        No exception is caught here; errors raised by the underlying runnable propagate unchanged.
        """
        return await self._create_chain().ainvoke(chain_input, config=config, **kwargs)

    def invoke(
        self, chain_input: RunnableInput, config: Optional[RunnableConfig] = None, **kwargs: Any
    ) -> RunnableOutput:
        """
        Invoke the chain with given input.

        Parameters
        ----------
        chain_input : RunnableInput
            The input to be processed by the chain.
        config : Optional[RunnableConfig]
            Configuration for the chain execution (default None).
        **kwargs : Any
            Additional keyword arguments forwarded to the underlying runnable.

        Returns
        -------
        RunnableOutput
            The output generated by the chain.

        Notes
        -----
        No exception is caught here; errors raised by the underlying runnable propagate unchanged.
        """
        return self._create_chain().invoke(chain_input, config=config, **kwargs)

    def _create_chain(self) -> Runnable:
        """Compose ``prompt | llm`` from the Langfuse manager.

        The runnable is rebuilt on every call, so each invocation asks the manager
        again for the prompt and the LLM registered under this class's name.
        """
        chain_name = self.__class__.__name__
        return self._langfuse_manager.get_base_prompt(chain_name) | self._langfuse_manager.get_base_llm(chain_name)
@@ -0,0 +1,88 @@
1
+ """Module for converting an image to markdown using a Langchain chain."""
2
+
3
+ import io
4
+ import base64
5
+ from PIL import Image
6
+ from PIL.ImageFile import ImageFile
7
+
8
+ from langchain_ocr_lib.converter.converter import File2MarkdownConverter
9
+
10
+
11
class Image2MarkdownConverter(File2MarkdownConverter):
    """Converts an image to markdown using a Langchain chain."""

    async def aconvert2markdown(self, file: ImageFile | None = None, filename: str | None = None) -> str:
        """
        Asynchronously converts an image to markdown using a Langchain chain.

        Parameters
        ----------
        file : ImageFile | None, optional
            PIL Image object to convert, by default None
        filename : str | None, optional
            Path to the image file to convert, by default None.
            Only used when `file` is None.

        Returns
        -------
        str
            Markdown representation of the image.

        Raises
        ------
        ValueError
            If no file or filename is provided.
        ValueError
            If the image behind `filename` cannot be opened.
        """
        base64_img = self._to_base64_png(file, filename)
        response = await self._chain.ainvoke({"image_data": base64_img})

        return response.content

    def convert2markdown(self, file: ImageFile | None = None, filename: str | None = None) -> str:
        """
        Convert an image to markdown using a Langchain chain.

        Parameters
        ----------
        file : ImageFile | None, optional
            PIL Image object to convert, by default None
        filename : str | None, optional
            Path to the image file to convert, by default None.
            Only used when `file` is None.

        Returns
        -------
        str
            Markdown representation of the image.

        Raises
        ------
        ValueError
            If no file or filename is provided.
        ValueError
            If the image behind `filename` cannot be opened.
        """
        base64_img = self._to_base64_png(file, filename)
        response = self._chain.invoke({"image_data": base64_img})

        return response.content

    @staticmethod
    def _encode_png(image) -> str:
        """Serialize a PIL image as PNG and return it base64-encoded as text."""
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    @staticmethod
    def _to_base64_png(file, filename) -> str:
        """Validate the arguments and return the image as a base64-encoded PNG.

        A caller-supplied `file` takes precedence over `filename` and is left open.
        An image opened here from `filename` is closed again before returning.
        """
        if file is None and filename is None:
            raise ValueError("No file provided")
        if file is not None:
            return Image2MarkdownConverter._encode_png(file)

        try:
            opened = Image.open(filename)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Image corrupted or unsupported file type, {e}") from e

        # Encoding stays outside the try block: as before, errors raised while saving propagate unchanged.
        with opened:
            return Image2MarkdownConverter._encode_png(opened)
@@ -0,0 +1,105 @@
1
+ """Module for converting PDF files to markdown."""
2
+
3
+ from pdf2image import convert_from_bytes
4
+ import io
5
+ import base64
6
+
7
+ from langchain_ocr_lib.converter.converter import File2MarkdownConverter
8
+
9
+
10
class Pdf2MarkdownConverter(File2MarkdownConverter):
    """Converts PDF files to markdown format.

    This class provides methods to convert PDF files, either provided as bytes or by filename,
    into markdown format. Every page is rendered to an image and sent through the chain;
    the per-page results are concatenated in page order without a separator.

    Attributes
    ----------
    _chain : Chain
        The OCR chain used to process images.
    """

    async def aconvert2markdown(self, file: bytes | None = None, filename: str | None = None) -> str:
        """Asynchronously converts a PDF file (either provided as bytes or by filename) into markdown.

        Pages are processed one after another, not concurrently.

        Parameters
        ----------
        file : bytes, optional
            The PDF file as bytes. Defaults to None.
        filename : str, optional
            The path to the PDF file. Only used when `file` is None. Defaults to None.

        Returns
        -------
        str
            The markdown representation of the PDF content extracted via OCR.

        Raises
        ------
        ValueError
            If neither `file` nor `filename` is provided.
        ValueError
            If the file behind `filename` cannot be read.
        """
        pdf_bytes = self._read_pdf(file, filename)

        page_markdown = []
        for base64_img in self._iter_base64_pages(pdf_bytes):
            response = await self._chain.ainvoke({"image_data": base64_img})
            page_markdown.append(response.content)
        return "".join(page_markdown)

    def convert2markdown(self, file: bytes | None = None, filename: str | None = None) -> str:
        """Convert a PDF file (either provided as bytes or by filename) into markdown.

        Parameters
        ----------
        file : bytes, optional
            The PDF file as bytes. Defaults to None.
        filename : str, optional
            The path to the PDF file. Only used when `file` is None. Defaults to None.

        Returns
        -------
        str
            The markdown representation of the PDF content extracted via OCR.

        Raises
        ------
        ValueError
            If neither `file` nor `filename` is provided.
        ValueError
            If the file behind `filename` cannot be read.
        """
        pdf_bytes = self._read_pdf(file, filename)

        page_markdown = []
        for base64_img in self._iter_base64_pages(pdf_bytes):
            response = self._chain.invoke({"image_data": base64_img})
            page_markdown.append(response.content)
        return "".join(page_markdown)

    @staticmethod
    def _read_pdf(file, filename) -> bytes:
        """Return the PDF content, reading it from `filename` when `file` is not given.

        Both public methods share this helper so they raise the same message and
        both chain the original exception.
        """
        if file is None and filename is None:
            raise ValueError("No file provided")
        if file is not None:
            return file

        try:
            with open(filename, "rb") as f:
                return f.read()
        except Exception as e:
            raise ValueError(f"PDF corrupted or unsupported file type, {e}") from e

    @staticmethod
    def _iter_base64_pages(pdf_bytes):
        """Yield every page of the PDF as a base64-encoded PNG string, in page order.

        NOTE(review): errors raised by `convert_from_bytes` (e.g. for a damaged PDF) are not
        wrapped and propagate unchanged - confirm whether callers expect `ValueError` here.
        """
        for image in convert_from_bytes(pdf_bytes):
            buf = io.BytesIO()
            image.save(buf, format="PNG")
            yield base64.b64encode(buf.getvalue()).decode("utf-8")