langchain-ocr-lib 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_ocr_lib-0.1.0/PKG-INFO +28 -0
- langchain_ocr_lib-0.1.0/README.md +0 -0
- langchain_ocr_lib-0.1.0/pyproject.toml +113 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/chains/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/chains/chain.py +55 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/converter/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/converter/converter.py +52 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/di_config.py +86 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/chains/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/chains/ocr_chain.py +86 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/converter/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/converter/image_converter.py +88 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/converter/pdf_converter.py +105 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/langfuse_manager/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/langfuse_manager/langfuse_manager.py +149 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/llms/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/llms/llm_factory.py +66 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/llms/llm_type.py +11 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/langfuse_settings.py +29 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/language_settings.py +25 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/llm_class_type_settings.py +27 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/ollama_chat_settings.py +42 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/settings/openai_chat_settings.py +35 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/tracers/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/impl/tracers/langfuse_traced_chain.py +44 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/language_mapping/language_mapping.py +19 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/main.py +122 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/prompt_templates/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/prompt_templates/ocr_prompt.py +60 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/tracers/__init__.py +0 -0
- langchain_ocr_lib-0.1.0/src/langchain_ocr_lib/tracers/traced_chain.py +88 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
Metadata-Version: 2.3
|
2
|
+
Name: langchain-ocr-lib
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary:
|
5
|
+
License: MIT
|
6
|
+
Author: Andreas Klos
|
7
|
+
Author-email: aklos@outlook.de
|
8
|
+
Requires-Python: >=3.11,<4.0
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
14
|
+
Requires-Dist: deprecated (>=1.2.14,<2.0.0)
|
15
|
+
Requires-Dist: inject (>=5.2.1,<6.0.0)
|
16
|
+
Requires-Dist: langchain-community (>=0.3.19,<0.4.0)
|
17
|
+
Requires-Dist: langchain-ollama (>=0.2.0,<0.3.0)
|
18
|
+
Requires-Dist: langchain-openai (>=0.3.8,<0.4.0)
|
19
|
+
Requires-Dist: langfuse (>=2.59.7,<3.0.0)
|
20
|
+
Requires-Dist: openai (>=1.42.0,<2.0.0)
|
21
|
+
Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
22
|
+
Requires-Dist: pillow (>=11.0.0,<12.0.0)
|
23
|
+
Requires-Dist: pycountry (>=24.6.1,<25.0.0)
|
24
|
+
Requires-Dist: pytest-asyncio (>=0.25.0,<0.26.0)
|
25
|
+
Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
|
26
|
+
Description-Content-Type: text/markdown
|
27
|
+
|
28
|
+
|
File without changes
|
@@ -0,0 +1,113 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["poetry-core"]
|
3
|
+
build-backend = "poetry.core.masonry.api"
|
4
|
+
|
5
|
+
[tool.poetry.scripts]
|
6
|
+
langchain-ocr = "langchain_ocr_lib.main:main"
|
7
|
+
|
8
|
+
[tool.poetry]
|
9
|
+
name = "langchain-ocr-lib"
|
10
|
+
version = "0.1.0"
|
11
|
+
description = ""
|
12
|
+
authors = ["Andreas Klos <aklos@outlook.de>"]
|
13
|
+
readme = "README.md"
|
14
|
+
packages = [{ include = "langchain_ocr_lib", from = "src" }]
|
15
|
+
license = "MIT"
|
16
|
+
|
17
|
+
[tool.poetry.dependencies]
|
18
|
+
python = "^3.11"
|
19
|
+
deprecated = "^1.2.14"
|
20
|
+
pyyaml = "^6.0.2"
|
21
|
+
openai = "^1.42.0"
|
22
|
+
pillow = "^11.0.0"
|
23
|
+
langchain-ollama = "^0.2.0"
|
24
|
+
pytest-asyncio = "^0.25.0"
|
25
|
+
langchain-community = "^0.3.19"
|
26
|
+
langchain-openai = "^0.3.8"
|
27
|
+
langfuse = "^2.59.7"
|
28
|
+
pycountry = "^24.6.1"
|
29
|
+
pdf2image = "^1.17.0"
|
30
|
+
inject = "^5.2.1"
|
31
|
+
|
32
|
+
[tool.poetry.group.dev.dependencies]
|
33
|
+
debugpy = "^1.8.1"
|
34
|
+
pytest = "^8.2.1"
|
35
|
+
coverage = "^7.5.4"
|
36
|
+
flake8 = "^7.1.0"
|
37
|
+
flake8-black = "^0.3.6"
|
38
|
+
flake8-pyproject = "^1.2.3"
|
39
|
+
flake8-quotes = "^3.4.0"
|
40
|
+
flake8-return = "^1.2.0"
|
41
|
+
flake8-annotations-complexity = "^0.0.8"
|
42
|
+
flake8-bandit = "^4.1.1"
|
43
|
+
flake8-bugbear = "^24.8.19"
|
44
|
+
flake8-builtins = "^2.5.0"
|
45
|
+
flake8-comprehensions = "^3.15.0"
|
46
|
+
flake8-eradicate = "^1.5.0"
|
47
|
+
flake8-expression-complexity = "^0.0.11"
|
48
|
+
flake8-pytest-style = "^2.0.0"
|
49
|
+
pep8-naming = "^0.14.1"
|
50
|
+
flake8-eol = "^0.0.8"
|
51
|
+
flake8-exceptions = "^0.0.1a0"
|
52
|
+
flake8-simplify = "^0.21.0"
|
53
|
+
flake8-wot = "^0.2.0"
|
54
|
+
flake8-function-order = "^0.0.5"
|
55
|
+
flake8-tidy-imports = "^4.10.0"
|
56
|
+
black = "^23.9.1"
|
57
|
+
# flake8-logging-format = "^2024.24.12"
|
58
|
+
flake8-docstrings = "^1.7.0"
|
59
|
+
|
60
|
+
|
61
|
+
[tool.flake8]
|
62
|
+
exclude= [".eggs", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist", "**/__init__.py", "src/langchain_ocr_lib/prompt_templates/ocr_prompt.py"]
|
63
|
+
statistics = true
|
64
|
+
show-source = false
|
65
|
+
max-complexity = 8
|
66
|
+
max-annotations-complexity = 3
|
67
|
+
docstring-convention = 'numpy'
|
68
|
+
max-line-length = 120
|
69
|
+
ignore = ["E203", "W503", "E704"]
|
70
|
+
inline-quotes = '"'
|
71
|
+
docstring-quotes = '"""'
|
72
|
+
multiline-quotes = '"""'
|
73
|
+
dictionaries = ["en_US", "python", "technical", "pandas"]
|
74
|
+
ban-relative-imports = true
|
75
|
+
per-file-ignores = """
|
76
|
+
|
77
|
+
"""
|
78
|
+
|
79
|
+
[tool.black]
|
80
|
+
line-length = 120
|
81
|
+
exclude = """
|
82
|
+
/(
|
83
|
+
.eggs
|
84
|
+
| .git
|
85
|
+
| .hg
|
86
|
+
| .mypy_cache
|
87
|
+
| .nox
|
88
|
+
| .pants.d
|
89
|
+
| .tox
|
90
|
+
| .venv
|
91
|
+
| _build
|
92
|
+
| buck-out
|
93
|
+
| build
|
94
|
+
| dist
|
95
|
+
| node_modules
|
96
|
+
| venv
|
97
|
+
)/
|
98
|
+
"""
|
99
|
+
|
100
|
+
[tool.isort]
|
101
|
+
profile = "black"
|
102
|
+
skip = ['.eggs', '.git', '.hg', '.mypy_cache', '.nox', '.pants.d', '.tox', '.venv', '_build', 'buck-out', 'build', 'dist', 'node_modules', 'venv']
|
103
|
+
skip_gitignore = true
|
104
|
+
known_local_folder = ["langchain_ocr_lib"]
|
105
|
+
|
106
|
+
[tool.pylint]
|
107
|
+
max-line-length = 120
|
108
|
+
|
109
|
+
[tool.pytest.ini_options]
|
110
|
+
log_cli = 1
|
111
|
+
log_cli_level = "DEBUG"
|
112
|
+
testpaths = "tests"
|
113
|
+
pythonpath = "src"
|
File without changes
|
File without changes
|
@@ -0,0 +1,55 @@
|
|
1
|
+
"""Module for the base class of chains."""
|
2
|
+
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
from typing import Any, Optional
|
5
|
+
|
6
|
+
from langchain_core.runnables import Runnable, RunnableConfig
|
7
|
+
from langchain_core.runnables.utils import Input, Output
|
8
|
+
|
9
|
+
|
10
|
+
class Chain(Runnable[Input, Output], ABC):
    """Abstract base class for chains.

    Combines the generic ``Runnable[Input, Output]`` with ``ABC`` and declares both
    ``ainvoke`` and ``invoke`` as abstract, so a concrete chain has to implement the
    asynchronous and the synchronous entry point.
    """

    # NOTE(review): the first positional parameter is called ``chain_input`` here; if the
    # underlying ``Runnable`` names it differently, keyword callers are affected — confirm.

    @abstractmethod
    async def ainvoke(self, chain_input: Input, config: Optional[RunnableConfig] = None, **kwargs: Any) -> Output:
        """Asynchronously invoke the chain with the given input and configuration.

        This method is abstract and has no body here; subclasses provide the implementation.

        Parameters
        ----------
        chain_input : Input
            The input data required to asynchronously invoke the chain.
        config : Optional[RunnableConfig], optional
            The configuration settings for the chain invocation, by default None.
        **kwargs : Any
            Additional keyword arguments that may be required for the chain invocation.

        Returns
        -------
        Output
            The result of the chain invocation.
        """

    @abstractmethod
    def invoke(self, chain_input: Input, config: Optional[RunnableConfig] = None, **kwargs: Any) -> Output:
        """Synchronously invoke the chain with the given input and configuration.

        This method is abstract and has no body here; subclasses provide the implementation.

        Parameters
        ----------
        chain_input : Input
            The input data required to invoke the chain.
        config : Optional[RunnableConfig], optional
            The configuration settings for the chain invocation, by default None.
        **kwargs : Any
            Additional keyword arguments that may be required for the chain invocation.

        Returns
        -------
        Output
            The result of the chain invocation.
        """
|
File without changes
|
@@ -0,0 +1,52 @@
|
|
1
|
+
"""Module for the File2MarkdownConverter class."""
|
2
|
+
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
import inject
|
5
|
+
|
6
|
+
|
7
|
+
class File2MarkdownConverter(ABC):
    """Abstract base class for converters that turn a file into markdown.

    Subclasses must implement both the asynchronous ``aconvert2markdown`` and the
    synchronous ``convert2markdown`` method.
    """

    # Chain used by subclasses to run the conversion; obtained from the `inject`
    # library under the string key "LangfuseTracedChain".
    _chain = inject.attr("LangfuseTracedChain")

    # NOTE(review): the concrete converters shipped in this package accept
    # ``(file=None, filename=None)`` instead of a single required ``file: bytes`` —
    # confirm whether the abstract signatures below should be widened to match.

    @abstractmethod
    async def aconvert2markdown(self, file: bytes) -> str:
        """Asynchronously convert file to markdown format.

        Parameters
        ----------
        file : bytes
            The file to convert.

        Returns
        -------
        str
            The markdown representation of the file.

        Raises
        ------
        NotImplementedError
            Always, when this base implementation is reached (e.g. via ``super()``).
        """
        raise NotImplementedError

    @abstractmethod
    def convert2markdown(self, file: bytes) -> str:
        """Convert file to markdown format.

        Parameters
        ----------
        file : bytes
            The file to convert.

        Returns
        -------
        str
            The markdown representation of the file.

        Raises
        ------
        NotImplementedError
            Always, when this base implementation is reached (e.g. via ``super()``).
        """
        raise NotImplementedError
|
@@ -0,0 +1,86 @@
|
|
1
|
+
"""Module containing the dependency injection container for managing application dependencies."""
|
2
|
+
|
3
|
+
from inject import Binder
|
4
|
+
import inject
|
5
|
+
from langchain_ollama import ChatOllama
|
6
|
+
from langchain_openai import ChatOpenAI
|
7
|
+
from langfuse import Langfuse
|
8
|
+
|
9
|
+
from langchain_ocr_lib.impl.chains.ocr_chain import OcrChain
|
10
|
+
from langchain_ocr_lib.impl.settings.ollama_chat_settings import OllamaSettings
|
11
|
+
from langchain_ocr_lib.impl.settings.openai_chat_settings import OpenAISettings
|
12
|
+
from langchain_ocr_lib.impl.settings.llm_class_type_settings import LlmClassTypeSettings
|
13
|
+
from langchain_ocr_lib.impl.settings.langfuse_settings import LangfuseSettings
|
14
|
+
from langchain_ocr_lib.impl.settings.language_settings import LanguageSettings
|
15
|
+
from langchain_ocr_lib.impl.tracers.langfuse_traced_chain import LangfuseTracedChain
|
16
|
+
from langchain_ocr_lib.prompt_templates.ocr_prompt import ocr_prompt_template_builder
|
17
|
+
from langchain_ocr_lib.impl.llms.llm_factory import llm_provider
|
18
|
+
from langchain_ocr_lib.impl.langfuse_manager.langfuse_manager import LangfuseManager
|
19
|
+
from langchain_ocr_lib.impl.converter.pdf_converter import Pdf2MarkdownConverter
|
20
|
+
from langchain_ocr_lib.impl.converter.image_converter import Image2MarkdownConverter
|
21
|
+
|
22
|
+
|
23
|
+
def lib_di_config(binder: Binder):
    """Register all objects the OCR library resolves through dependency injection.

    The settings objects are created first, then the chat model for the configured
    backend, the OCR prompt, the Langfuse client and manager, the OCR chain, its traced
    wrapper and finally the PDF and image converters. Every object is bound under a
    string key.

    Parameters
    ----------
    binder : Binder
        The dependency injection binder instance used to register the bindings.

    Raises
    ------
    NotImplementedError
        If the configured LLM type is neither ``"ollama"`` nor ``"openai"``.
    """
    tracing_settings = LangfuseSettings()
    llm_choice = LlmClassTypeSettings()
    ocr_language = LanguageSettings()

    # Select the settings class and the chat-model class for the configured backend.
    if llm_choice.llm_type == "ollama":
        settings_cls, chat_model_cls = OllamaSettings, ChatOllama
    elif llm_choice.llm_type == "openai":
        settings_cls, chat_model_cls = OpenAISettings, ChatOpenAI
    else:
        raise NotImplementedError("Configured LLM is not implemented")

    llm_settings = settings_cls()
    binder.bind("LargeLanguageModel", llm_provider(llm_settings, chat_model_cls))

    ocr_prompt = ocr_prompt_template_builder(language=ocr_language.language, model_name=llm_settings.model)

    langfuse_client = Langfuse(
        public_key=tracing_settings.public_key,
        secret_key=tracing_settings.secret_key,
        host=tracing_settings.host,
    )
    binder.bind("LangfuseClient", langfuse_client)

    # The manager maps each chain's class name to the prompt it manages.
    prompt_manager = LangfuseManager(managed_prompts={OcrChain.__name__: ocr_prompt})
    binder.bind("LangfuseManager", prompt_manager)

    binder.bind("OcrChain", OcrChain())

    traced_chain = LangfuseTracedChain(settings=tracing_settings)
    binder.bind("LangfuseTracedChain", traced_chain)

    binder.bind("PdfConverter", Pdf2MarkdownConverter())
    binder.bind("ImageConverter", Image2MarkdownConverter())
|
82
|
+
|
83
|
+
|
84
|
+
def configure_di():
    """Configure dependency injection using the `inject` library.

    Passes ``lib_di_config`` to ``inject.configure`` so that all library bindings are
    registered.
    """
    # NOTE(review): allow_override=True / clear=True presumably let this be called again
    # (replacing an existing injector and permitting rebinding) — confirm against the
    # `inject` documentation.
    inject.configure(lib_di_config, allow_override=True, clear=True)
|
File without changes
|
File without changes
|
@@ -0,0 +1,86 @@
|
|
1
|
+
"""Module for LLM answer generation chain."""
|
2
|
+
|
3
|
+
from typing import Any, Optional
|
4
|
+
|
5
|
+
from langchain_core.runnables import Runnable, RunnableConfig
|
6
|
+
from langchain_core.runnables.utils import Input
|
7
|
+
import inject
|
8
|
+
|
9
|
+
from langchain_ocr_lib.chains.chain import Chain
|
10
|
+
|
11
|
+
# Input type of the OCR chain; for now simply the generic ``Input`` name imported
# from ``langchain_core.runnables.utils``.
RunnableInput = Input  # TODO: adjust properly
# Output type the OCR chain is annotated with.
# NOTE(review): the converters in this package read ``.content`` from the chain result,
# which suggests a message object rather than ``str`` — confirm the intended type.
RunnableOutput = str
|
13
|
+
|
14
|
+
|
15
|
+
class OcrChain(Chain[RunnableInput, RunnableOutput]):
    """Chain that pipes the managed OCR prompt into the managed LLM.

    Prompt and LLM are both requested from the injected Langfuse manager, keyed by this
    class's name, and composed anew on every invocation.
    """

    # Resolved through `inject` under the key "LangfuseManager".
    _langfuse_manager = inject.attr("LangfuseManager")

    def __init__(self):
        """Initialize the OcrChain.

        Takes no arguments; the Langfuse manager is supplied via the injected
        ``_langfuse_manager`` class attribute rather than through the constructor.
        """

    async def ainvoke(
        self, chain_input: RunnableInput, config: Optional[RunnableConfig] = None, **kwargs: Any
    ) -> RunnableOutput:
        """Asynchronously run the prompt-to-LLM pipeline on the given input.

        Parameters
        ----------
        chain_input : RunnableInput
            The input to be processed by the chain.
        config : Optional[RunnableConfig]
            Configuration for the chain execution (default None).
        **kwargs : Any
            Accepted for interface compatibility; not forwarded to the underlying runnable.

        Returns
        -------
        RunnableOutput
            The output generated by the chain.
        """
        pipeline = self._create_chain()
        return await pipeline.ainvoke(chain_input, config=config)

    def invoke(
        self, chain_input: RunnableInput, config: Optional[RunnableConfig] = None, **kwargs: Any
    ) -> RunnableOutput:
        """Run the prompt-to-LLM pipeline on the given input.

        Parameters
        ----------
        chain_input : RunnableInput
            The input to be processed by the chain.
        config : Optional[RunnableConfig]
            Configuration for the chain execution (default None).
        **kwargs : Any
            Accepted for interface compatibility; not forwarded to the underlying runnable.

        Returns
        -------
        RunnableOutput
            The output generated by the chain.
        """
        pipeline = self._create_chain()
        return pipeline.invoke(chain_input, config=config)

    def _create_chain(self) -> Runnable:
        # Both lookups use the class name as key; the prompt is fetched before the LLM.
        chain_name = self.__class__.__name__
        prompt = self._langfuse_manager.get_base_prompt(chain_name)
        llm = self._langfuse_manager.get_base_llm(chain_name)
        return prompt | llm
|
File without changes
|
@@ -0,0 +1,88 @@
|
|
1
|
+
"""Module for converting an image to markdown using a Langchain chain."""
|
2
|
+
|
3
|
+
import io
|
4
|
+
import base64
|
5
|
+
from PIL import Image
|
6
|
+
from PIL.ImageFile import ImageFile
|
7
|
+
|
8
|
+
from langchain_ocr_lib.converter.converter import File2MarkdownConverter
|
9
|
+
|
10
|
+
|
11
|
+
class Image2MarkdownConverter(File2MarkdownConverter):
    """Converts an image to markdown using a Langchain chain.

    The image is re-encoded as PNG, base64-encoded and handed to the injected chain
    under the key ``"image_data"``; the ``content`` of the chain's response is returned.
    """

    async def aconvert2markdown(self, file: ImageFile | None = None, filename: str | None = None) -> str:
        """
        Asynchronously converts an image to markdown using a Langchain chain.

        Parameters
        ----------
        file : ImageFile | None, optional
            PIL Image object to convert, by default None. Takes precedence over ``filename``.
        filename : str | None, optional
            Path to the image file to convert, by default None

        Returns
        -------
        str
            Markdown representation of the image.

        Raises
        ------
        ValueError
            If no file or filename is provided.
        ValueError
            If the image at ``filename`` cannot be opened.
        """
        base64_img = self._load_base64_png(file, filename)
        response = await self._chain.ainvoke({"image_data": base64_img})

        return response.content

    def convert2markdown(self, file: ImageFile | None = None, filename: str | None = None) -> str:
        """
        Convert an image to markdown using a Langchain chain.

        Parameters
        ----------
        file : ImageFile | None, optional
            PIL Image object to convert, by default None. Takes precedence over ``filename``.
        filename : str | None, optional
            Path to the image file to convert, by default None

        Returns
        -------
        str
            Markdown representation of the image.

        Raises
        ------
        ValueError
            If no file or filename is provided.
        ValueError
            If the image at ``filename`` cannot be opened.
        """
        base64_img = self._load_base64_png(file, filename)
        response = self._chain.invoke({"image_data": base64_img})

        return response.content

    @staticmethod
    def _encode_png_base64(image: ImageFile) -> str:
        """Serialize ``image`` as PNG and return the bytes base64-encoded as text."""
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    @classmethod
    def _load_base64_png(cls, file: ImageFile | None, filename: str | None) -> str:
        """Validate the arguments and return the image as base64-encoded PNG data.

        Shared by the sync and async entry points so both validate and encode identically.
        """
        if file is not None:
            # A caller-supplied image stays owned by the caller and is not closed here.
            return cls._encode_png_base64(file)
        if filename is None:
            raise ValueError("No file provided")

        try:
            opened = Image.open(filename)
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise ValueError(f"Image corrupted or unsupported file type, {e}") from e

        # The image was opened here, so close its file handle once it is encoded.
        with opened:
            return cls._encode_png_base64(opened)
|
@@ -0,0 +1,105 @@
|
|
1
|
+
"""Module for converting PDF files to markdown."""
|
2
|
+
|
3
|
+
from pdf2image import convert_from_bytes
|
4
|
+
import io
|
5
|
+
import base64
|
6
|
+
|
7
|
+
from langchain_ocr_lib.converter.converter import File2MarkdownConverter
|
8
|
+
|
9
|
+
|
10
|
+
class Pdf2MarkdownConverter(File2MarkdownConverter):
    """Converts PDF files to markdown format.

    This class provides methods to convert PDF files, either provided as bytes or by filename,
    into markdown format. Every page is rendered to an image, sent through the chain and the
    per-page results are concatenated without a separator.

    Attributes
    ----------
    _chain : Chain
        The OCR chain used to process images.
    """

    async def aconvert2markdown(self, file: bytes | None = None, filename: str | None = None) -> str:
        """Asynchronously converts a PDF file (either provided as bytes or by filename) into markdown.

        Parameters
        ----------
        file : bytes, optional
            The PDF file as bytes. Defaults to None. Takes precedence over ``filename``.
        filename : str, optional
            The path to the PDF file. Defaults to None.

        Returns
        -------
        str
            The markdown representation of the PDF content extracted via OCR.

        Raises
        ------
        ValueError
            If neither `file` nor `filename` is provided.
        ValueError
            If the file at `filename` cannot be read.

        Notes
        -----
        Errors raised while rendering the PDF pages are not wrapped and propagate unchanged.
        The page rendering itself is a synchronous call made inside this coroutine.
        """
        pdf_bytes = self._read_pdf_bytes(file, filename)
        pages = convert_from_bytes(pdf_bytes)

        # Pages are processed one after another so the output keeps page order.
        page_markdown = []
        for page in pages:
            response = await self._chain.ainvoke({"image_data": self._encode_png_base64(page)})
            page_markdown.append(response.content)
        return "".join(page_markdown)

    def convert2markdown(self, file: bytes | None = None, filename: str | None = None) -> str:
        """Convert a PDF file (either provided as bytes or by filename) into markdown.

        Parameters
        ----------
        file : bytes, optional
            The PDF file as bytes. Defaults to None. Takes precedence over ``filename``.
        filename : str, optional
            The path to the PDF file. Defaults to None.

        Returns
        -------
        str
            The markdown representation of the PDF content extracted via OCR.

        Raises
        ------
        ValueError
            If neither `file` nor `filename` is provided.
        ValueError
            If the file at `filename` cannot be read.

        Notes
        -----
        Errors raised while rendering the PDF pages are not wrapped and propagate unchanged.
        """
        pdf_bytes = self._read_pdf_bytes(file, filename)
        pages = convert_from_bytes(pdf_bytes)

        page_markdown = []
        for page in pages:
            response = self._chain.invoke({"image_data": self._encode_png_base64(page)})
            page_markdown.append(response.content)
        return "".join(page_markdown)

    @staticmethod
    def _read_pdf_bytes(file: bytes | None, filename: str | None) -> bytes:
        """Return the PDF content, reading it from ``filename`` when no bytes were given.

        Shared by the sync and async entry points so both validate and fail identically.
        """
        if file is not None:
            return file
        if filename is None:
            raise ValueError("No file provided")
        try:
            with open(filename, "rb") as f:
                return f.read()
        except Exception as e:
            # Keep the detail in the message and chain the cause for the full traceback.
            raise ValueError(f"PDF corrupted or unsupported file type, {e}") from e

    @staticmethod
    def _encode_png_base64(image) -> str:
        """Serialize a rendered page image as PNG and return it base64-encoded as text."""
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")
|
File without changes
|