azure-form-recognizer-haystack 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure_form_recognizer_haystack-1.0.0.dist-info/METADATA +41 -0
- azure_form_recognizer_haystack-1.0.0.dist-info/RECORD +7 -0
- azure_form_recognizer_haystack-1.0.0.dist-info/WHEEL +4 -0
- azure_form_recognizer_haystack-1.0.0.dist-info/licenses/LICENSE +201 -0
- haystack_integrations/components/converters/azure_form_recognizer/__init__.py +7 -0
- haystack_integrations/components/converters/azure_form_recognizer/converter.py +477 -0
- haystack_integrations/components/converters/py.typed +0 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: azure-form-recognizer-haystack
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Haystack integration for Azure Document Intelligence using the Form Recognizer SDK
|
|
5
|
+
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/azure_form_recognizer#readme
|
|
6
|
+
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
7
|
+
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/azure_form_recognizer
|
|
8
|
+
Author-email: deepset GmbH <info@deepset.ai>
|
|
9
|
+
License-Expression: Apache-2.0
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: Azure,Document Converter,Document Intelligence,Form Recognizer,Haystack,OCR,PDF
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Programming Language :: Python
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
20
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: azure-ai-formrecognizer>=3.3.0
|
|
23
|
+
Requires-Dist: haystack-ai>=2.22.0
|
|
24
|
+
Requires-Dist: pandas>=2.3.3
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# azure-form-recognizer-haystack
|
|
28
|
+
|
|
29
|
+
[![PyPI - Version](https://img.shields.io/pypi/v/azure-form-recognizer-haystack.svg)](https://pypi.org/project/azure-form-recognizer-haystack)
|
|
30
|
+
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/azure-form-recognizer-haystack.svg)](https://pypi.org/project/azure-form-recognizer-haystack)
|
|
31
|
+
|
|
32
|
+
- [Integration page](https://haystack.deepset.ai/integrations/azure-form-recognizer)
|
|
33
|
+
- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/azure_form_recognizer/CHANGELOG.md)
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Contributing
|
|
38
|
+
|
|
39
|
+
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
|
|
40
|
+
|
|
41
|
+
To run integration tests locally, you need to export the `CORE_AZURE_CS_ENDPOINT` and `CORE_AZURE_CS_API_KEY` environment variables.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
haystack_integrations/components/converters/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
haystack_integrations/components/converters/azure_form_recognizer/__init__.py,sha256=Kq-E_vkkr-r2ugYAgo4U8xZEkD_jDegDcFzhJmBNIGM,201
|
|
3
|
+
haystack_integrations/components/converters/azure_form_recognizer/converter.py,sha256=bm631hypNhSFgEFwFlrz7KiF1hOXG7y10zMunmHL-yM,23501
|
|
4
|
+
azure_form_recognizer_haystack-1.0.0.dist-info/METADATA,sha256=jdafihojmavWR1CM0FlkGo93v0TSsFGgXtv4p9Nl7sc,2191
|
|
5
|
+
azure_form_recognizer_haystack-1.0.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
6
|
+
azure_form_recognizer_haystack-1.0.0.dist-info/licenses/LICENSE,sha256=_M2kulivnaiTHiW-5CRlZrPmH47tt04pBgAgeDvfYi4,11342
|
|
7
|
+
azure_form_recognizer_haystack-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity. For the purposes of this definition,
|
|
18
|
+
"control" means (i) the power, direct or indirect, to cause the
|
|
19
|
+
direction or management of such entity, whether by contract or
|
|
20
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
22
|
+
|
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
24
|
+
exercising permissions granted by this License.
|
|
25
|
+
|
|
26
|
+
"Source" form shall mean the preferred form for making modifications,
|
|
27
|
+
including but not limited to software source code, documentation
|
|
28
|
+
source, and configuration files.
|
|
29
|
+
|
|
30
|
+
"Object" form shall mean any form resulting from mechanical
|
|
31
|
+
transformation or translation of a Source form, including but
|
|
32
|
+
not limited to compiled object code, generated documentation,
|
|
33
|
+
and conversions to other media types.
|
|
34
|
+
|
|
35
|
+
"Work" shall mean the work of authorship, whether in Source or
|
|
36
|
+
Object form, made available under the License, as indicated by a
|
|
37
|
+
copyright notice that is included in or attached to the work
|
|
38
|
+
(an example is provided in the Appendix below).
|
|
39
|
+
|
|
40
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
|
41
|
+
form, that is based on (or derived from) the Work and for which the
|
|
42
|
+
editorial revisions, annotations, elaborations, or other modifications
|
|
43
|
+
represent, as a whole, an original work of authorship. For the purposes
|
|
44
|
+
of this License, Derivative Works shall not include works that remain
|
|
45
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
|
46
|
+
the Work and Derivative Works thereof.
|
|
47
|
+
|
|
48
|
+
"Contribution" shall mean any work of authorship, including
|
|
49
|
+
the original version of the Work and any modifications or additions
|
|
50
|
+
to that Work or Derivative Works thereof, that is intentionally
|
|
51
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
52
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
|
53
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
|
54
|
+
means any form of electronic, verbal, or written communication sent
|
|
55
|
+
to the Licensor or its representatives, including but not limited to
|
|
56
|
+
communication on electronic mailing lists, source code control systems,
|
|
57
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
|
58
|
+
Licensor for the purpose of discussing and improving the Work, but
|
|
59
|
+
excluding communication that is conspicuously marked or otherwise
|
|
60
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
|
61
|
+
|
|
62
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
63
|
+
on behalf of whom a Contribution has been received by Licensor and
|
|
64
|
+
subsequently incorporated within the Work.
|
|
65
|
+
|
|
66
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
67
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
68
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
69
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
70
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
71
|
+
Work and such Derivative Works in Source or Object form.
|
|
72
|
+
|
|
73
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
74
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
75
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
76
|
+
(except as stated in this section) patent license to make, have made,
|
|
77
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
78
|
+
where such license applies only to those patent claims licensable
|
|
79
|
+
by such Contributor that are necessarily infringed by their
|
|
80
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
|
81
|
+
with the Work to which such Contribution(s) was submitted. If You
|
|
82
|
+
institute patent litigation against any entity (including a
|
|
83
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
84
|
+
or a Contribution incorporated within the Work constitutes direct
|
|
85
|
+
or contributory patent infringement, then any patent licenses
|
|
86
|
+
granted to You under this License for that Work shall terminate
|
|
87
|
+
as of the date such litigation is filed.
|
|
88
|
+
|
|
89
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
90
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
91
|
+
modifications, and in Source or Object form, provided that You
|
|
92
|
+
meet the following conditions:
|
|
93
|
+
|
|
94
|
+
(a) You must give any other recipients of the Work or
|
|
95
|
+
Derivative Works a copy of this License; and
|
|
96
|
+
|
|
97
|
+
(b) You must cause any modified files to carry prominent notices
|
|
98
|
+
stating that You changed the files; and
|
|
99
|
+
|
|
100
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
101
|
+
that You distribute, all copyright, patent, trademark, and
|
|
102
|
+
attribution notices from the Source form of the Work,
|
|
103
|
+
excluding those notices that do not pertain to any part of
|
|
104
|
+
the Derivative Works; and
|
|
105
|
+
|
|
106
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
|
107
|
+
distribution, then any Derivative Works that You distribute must
|
|
108
|
+
include a readable copy of the attribution notices contained
|
|
109
|
+
within such NOTICE file, excluding those notices that do not
|
|
110
|
+
pertain to any part of the Derivative Works, in at least one
|
|
111
|
+
of the following places: within a NOTICE text file distributed
|
|
112
|
+
as part of the Derivative Works; within the Source form or
|
|
113
|
+
documentation, if provided along with the Derivative Works; or,
|
|
114
|
+
within a display generated by the Derivative Works, if and
|
|
115
|
+
wherever such third-party notices normally appear. The contents
|
|
116
|
+
of the NOTICE file are for informational purposes only and
|
|
117
|
+
do not modify the License. You may add Your own attribution
|
|
118
|
+
notices within Derivative Works that You distribute, alongside
|
|
119
|
+
or as an addendum to the NOTICE text from the Work, provided
|
|
120
|
+
that such additional attribution notices cannot be construed
|
|
121
|
+
as modifying the License.
|
|
122
|
+
|
|
123
|
+
You may add Your own copyright statement to Your modifications and
|
|
124
|
+
may provide additional or different license terms and conditions
|
|
125
|
+
for use, reproduction, or distribution of Your modifications, or
|
|
126
|
+
for any such Derivative Works as a whole, provided Your use,
|
|
127
|
+
reproduction, and distribution of the Work otherwise complies with
|
|
128
|
+
the conditions stated in this License.
|
|
129
|
+
|
|
130
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
131
|
+
any Contribution intentionally submitted for inclusion in the Work
|
|
132
|
+
by You to the Licensor shall be under the terms and conditions of
|
|
133
|
+
this License, without any additional terms or conditions.
|
|
134
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
|
135
|
+
the terms of any separate license agreement you may have executed
|
|
136
|
+
with Licensor regarding such Contributions.
|
|
137
|
+
|
|
138
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
139
|
+
names, trademarks, service marks, or product names of the Licensor,
|
|
140
|
+
except as required for reasonable and customary use in describing the
|
|
141
|
+
origin of the Work and reproducing the content of the NOTICE file.
|
|
142
|
+
|
|
143
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
144
|
+
agreed to in writing, Licensor provides the Work (and each
|
|
145
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
146
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
147
|
+
implied, including, without limitation, any warranties or conditions
|
|
148
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
149
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
150
|
+
appropriateness of using or redistributing the Work and assume any
|
|
151
|
+
risks associated with Your exercise of permissions under this License.
|
|
152
|
+
|
|
153
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
|
154
|
+
whether in tort (including negligence), contract, or otherwise,
|
|
155
|
+
unless required by applicable law (such as deliberate and grossly
|
|
156
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
|
157
|
+
liable to You for damages, including any direct, indirect, special,
|
|
158
|
+
incidental, or consequential damages of any character arising as a
|
|
159
|
+
result of this License or out of the use or inability to use the
|
|
160
|
+
Work (including but not limited to damages for loss of goodwill,
|
|
161
|
+
work stoppage, computer failure or malfunction, or any and all
|
|
162
|
+
other commercial damages or losses), even if such Contributor
|
|
163
|
+
has been advised of the possibility of such damages.
|
|
164
|
+
|
|
165
|
+
9. Accepting Warranty or Additional Liability. While redistributing
|
|
166
|
+
the Work or Derivative Works thereof, You may choose to offer,
|
|
167
|
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
|
168
|
+
or other liability obligations and/or rights consistent with this
|
|
169
|
+
License. However, in accepting such obligations, You may act only
|
|
170
|
+
on Your own behalf and on Your sole responsibility, not on behalf
|
|
171
|
+
of any other Contributor, and only if You agree to indemnify,
|
|
172
|
+
defend, and hold each Contributor harmless for any liability
|
|
173
|
+
incurred by, or claims asserted against, such Contributor by reason
|
|
174
|
+
of your accepting any such warranty or additional liability.
|
|
175
|
+
|
|
176
|
+
END OF TERMS AND CONDITIONS
|
|
177
|
+
|
|
178
|
+
APPENDIX: How to apply the Apache License to your work.
|
|
179
|
+
|
|
180
|
+
To apply the Apache License to your work, attach the following
|
|
181
|
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
|
182
|
+
replaced with your own identifying information. (Don't include
|
|
183
|
+
the brackets!) The text should be enclosed in the appropriate
|
|
184
|
+
comment syntax for the file format. We also recommend that a
|
|
185
|
+
file or class name and description of purpose be included on the
|
|
186
|
+
same "printed page" as the copyright notice for easier
|
|
187
|
+
identification within third-party archives.
|
|
188
|
+
|
|
189
|
+
Copyright 2023 deepset GmbH
|
|
190
|
+
|
|
191
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
192
|
+
you may not use this file except in compliance with the License.
|
|
193
|
+
You may obtain a copy of the License at
|
|
194
|
+
|
|
195
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
196
|
+
|
|
197
|
+
Unless required by applicable law or agreed to in writing, software
|
|
198
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
199
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
200
|
+
See the License for the specific language governing permissions and
|
|
201
|
+
limitations under the License.
|
|
@@ -0,0 +1,477 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import copy
|
|
6
|
+
import os
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
|
|
11
|
+
import networkx as nx
|
|
12
|
+
from azure.ai.formrecognizer import AnalyzeResult, DocumentAnalysisClient, DocumentLine, DocumentParagraph
|
|
13
|
+
from azure.core.credentials import AzureKeyCredential
|
|
14
|
+
from haystack import Document, component, default_from_dict, default_to_dict, logging
|
|
15
|
+
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
|
|
16
|
+
from haystack.dataclasses import ByteStream
|
|
17
|
+
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
18
|
+
from pandas import DataFrame
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@component
|
|
24
|
+
class AzureOCRDocumentConverter:
|
|
25
|
+
"""
|
|
26
|
+
Converts files to documents using Azure's Document Intelligence service.
|
|
27
|
+
|
|
28
|
+
Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.
|
|
29
|
+
|
|
30
|
+
To use this component, you need an active Azure account
|
|
31
|
+
and a Document Intelligence or Cognitive Services resource. For help with setting up your resource, see
|
|
32
|
+
[Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api).
|
|
33
|
+
|
|
34
|
+
### Usage example
|
|
35
|
+
```python
|
|
36
|
+
import os
|
|
37
|
+
from datetime import datetime
|
|
38
|
+
from haystack_integrations.components.converters.azure_form_recognizer import AzureOCRDocumentConverter
|
|
39
|
+
from haystack.utils import Secret
|
|
40
|
+
|
|
41
|
+
converter = AzureOCRDocumentConverter(
|
|
42
|
+
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"],
|
|
43
|
+
api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY"),
|
|
44
|
+
)
|
|
45
|
+
results = converter.run(
|
|
46
|
+
sources=["test/test_files/pdf/react_paper.pdf"],
|
|
47
|
+
meta={"date_added": datetime.now().isoformat()},
|
|
48
|
+
)
|
|
49
|
+
documents = results["documents"]
|
|
50
|
+
print(documents[0].content)
|
|
51
|
+
# 'This is a text from the PDF file.'
|
|
52
|
+
```
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
def __init__(
    self,
    endpoint: str,
    api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"),
    model_id: str = "prebuilt-read",
    preceding_context_len: int = 3,
    following_context_len: int = 3,
    merge_multiple_column_headers: bool = True,
    page_layout: Literal["natural", "single_column"] = "natural",
    threshold_y: float | None = 0.05,
    store_full_path: bool = False,
) -> None:
    """
    Creates an AzureOCRDocumentConverter component.

    :param endpoint:
        The endpoint of your Azure resource.
    :param api_key:
        The API key of your Azure resource.
    :param model_id:
        The ID of the model you want to use. For a list of available models, see [Azure documentation]
        (https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature).
    :param preceding_context_len:
        Number of lines before a table to include as preceding context
        (this will be added to the metadata).
    :param following_context_len:
        Number of lines after a table to include as subsequent context
        (this will be added to the metadata).
    :param merge_multiple_column_headers:
        If `True`, merges multiple column header rows into a single row.
    :param page_layout:
        The reading order to follow. Possible options:
        - `natural`: Uses the natural reading order determined by Azure.
        - `single_column`: Groups all lines with the same height on the page based on a threshold
          determined by `threshold_y`.
    :param threshold_y:
        Only relevant if `page_layout` is set to `single_column`.
        The threshold, in inches, to determine if two recognized PDF elements are grouped into a
        single line. This is crucial for section headers or numbers which may be spatially separated
        from the remaining text on the horizontal axis.
        If `None` is passed together with `page_layout="single_column"`, 0.05 is used.
    :param store_full_path:
        If True, the full path of the file is stored in the metadata of the document.
        If False, only the file name is stored.
    """
    # A secret that resolves to a falsy value is passed on as an empty key string.
    resolved_key = api_key.resolve_value() or ""
    credential = AzureKeyCredential(resolved_key)
    self.document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=credential)

    # The single-column layout always needs a concrete threshold.
    needs_default_threshold = page_layout == "single_column" and threshold_y is None

    self.endpoint = endpoint
    self.model_id = model_id
    self.api_key = api_key
    self.preceding_context_len = preceding_context_len
    self.following_context_len = following_context_len
    self.merge_multiple_column_headers = merge_multiple_column_headers
    self.page_layout = page_layout
    self.threshold_y = 0.05 if needs_default_threshold else threshold_y
    self.store_full_path = store_full_path
|
|
108
|
+
|
|
109
|
+
@component.output_types(documents=list[Document], raw_azure_response=list[dict])
def run(
    self, sources: list[str | Path | ByteStream], meta: dict[str, Any] | list[dict[str, Any]] | None = None
) -> dict[str, Any]:
    """
    Convert a list of files to Documents using Azure's Document Intelligence service.

    Sources that cannot be read are logged with a warning and skipped.

    :param sources:
        List of file paths or ByteStream objects.
    :param meta:
        Optional metadata to attach to the Documents.
        This value can be either a list of dictionaries or a single dictionary.
        If it's a single dictionary, its content is added to the metadata of all produced Documents.
        If it's a list, the length of the list must match the number of sources, because the two lists will be
        zipped. If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

    :returns:
        A dictionary with the following keys:
        - `documents`: List of created Documents
        - `raw_azure_response`: List of raw Azure responses used to create the Documents
    """
    converted: list[Document] = []
    raw_responses: list[dict] = []
    per_source_meta: list[dict[str, Any]] = normalize_metadata(meta=meta, sources_count=len(sources))

    for src, src_meta in zip(sources, per_source_meta, strict=True):
        try:
            stream = get_bytestream_from_source(source=src)
        except Exception as e:
            logger.warning("Could not read {source}. Skipping it. Error: {error}", source=src, error=e)
            continue

        # Start the analysis and block until the service returns the result.
        analysis = self.document_analysis_client.begin_analyze_document(
            model_id=self.model_id, document=stream.data
        ).result()
        raw_responses.append(analysis.to_dict())

        # Caller-supplied metadata wins over metadata carried by the ByteStream.
        doc_meta = {**stream.meta, **src_meta}

        # Unless full paths are requested, keep only the file name of the stream's path.
        original_path = stream.meta.get("file_path")
        if original_path and not self.store_full_path:
            doc_meta["file_path"] = os.path.basename(original_path)

        converted.extend(self._convert_tables_and_text(result=analysis, meta=doc_meta))

    return {"documents": converted, "raw_azure_response": raw_responses}
|
|
154
|
+
|
|
155
|
+
def to_dict(self) -> dict[str, Any]:
    """
    Serializes the component to a dictionary.

    The API key is serialized through the secret's own `to_dict`; all other init
    parameters are passed on as stored on the instance.

    :returns:
        Dictionary with serialized data.
    """
    init_parameters: dict[str, Any] = {
        "api_key": self.api_key.to_dict(),
        "endpoint": self.endpoint,
        "model_id": self.model_id,
        "preceding_context_len": self.preceding_context_len,
        "following_context_len": self.following_context_len,
        "merge_multiple_column_headers": self.merge_multiple_column_headers,
        "page_layout": self.page_layout,
        "threshold_y": self.threshold_y,
        "store_full_path": self.store_full_path,
    }
    return default_to_dict(self, **init_parameters)
|
|
174
|
+
|
|
175
|
+
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "AzureOCRDocumentConverter":
    """
    Deserializes the component from a dictionary.

    The serialized `api_key` entry inside `data["init_parameters"]` is deserialized
    in place before the component is built.

    :param data:
        The dictionary to deserialize from.
    :returns:
        The deserialized component.
    """
    init_parameters = data["init_parameters"]
    deserialize_secrets_inplace(init_parameters, keys=["api_key"])
    return default_from_dict(cls, data)
|
|
187
|
+
|
|
188
|
+
def _convert_tables_and_text(self, result: AnalyzeResult, meta: dict[str, Any] | None) -> list[Document]:
    """
    Converts the tables and text extracted by Azure's Document Intelligence service into Haystack Documents.

    :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result
        can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult).
    :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
        Can be any custom keys and values.
    :returns: List of Documents: one per extracted table, followed by a single Document with the text.
    """
    table_documents = self._convert_tables(result=result, meta=meta)

    if self.page_layout != "natural":
        # Single-column layout: fall back to 0.05 when no threshold is configured.
        effective_threshold = 0.05 if self.threshold_y is None else self.threshold_y
        text_document = self._convert_to_single_column_text(
            result=result, meta=meta, threshold_y=effective_threshold
        )
    else:
        text_document = self._convert_to_natural_text(result=result, meta=meta)

    return [*table_documents, text_document]
|
|
205
|
+
|
|
206
|
+
def _convert_tables(self, result: AnalyzeResult, meta: dict[str, Any] | None) -> list[Document]:
    """
    Converts the tables extracted by Azure's Document Intelligence service into Haystack Documents.

    Each table becomes one Document whose content is the table serialized as CSV (no header row, no index).
    The Document metadata is a deep copy of `meta` extended with:
    - `preceding_context`: up to `preceding_context_len` lines that start before the table on its first page,
      followed by the table caption (if one was detected), stripped of surrounding whitespace.
    - `following_context`: up to `following_context_len` lines that start after the table on its last page.
    - `page`: the page number of the table's first bounding region (only if bounding regions are present).

    A context length of 0 (or less) yields no context lines for the respective side.

    Note that the `:selected:`/`:unselected:` tags are removed from the cells of `result` in place.

    :param result: The AnalyzeResult Azure object
    :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.

    :returns: List of Documents containing the tables extracted from the AnalyzeResult object.
    """
    converted_tables: list[Document] = []

    if not result.tables:
        return converted_tables

    for table in result.tables:
        # Initialize table with empty cells
        table_list = [[""] * table.column_count for _ in range(table.row_count)]
        additional_column_header_rows = set()
        caption = ""
        row_idx_start = 0

        for idx, cell in enumerate(table.cells):
            # Remove ':selected:'/':unselected:' tags from cell's content
            cell.content = cell.content.replace(":selected:", "")
            cell.content = cell.content.replace(":unselected:", "")

            # Check if first row is a merged cell spanning whole table
            # -> exclude this row and use as a caption
            # NOTE(review): the first cell of a single-column table also satisfies this check — confirm intended.
            if idx == 0 and cell.column_span == table.column_count:
                caption = cell.content
                row_idx_start = 1
                table_list.pop(0)
                continue

            # A missing span is treated as 0, i.e. the cell is not written anywhere.
            column_span = cell.column_span if cell.column_span else 0
            row_span = cell.row_span if cell.row_span else 0
            for c in range(column_span):
                for r in range(row_span):
                    if (
                        self.merge_multiple_column_headers
                        and cell.kind == "columnHeader"
                        and cell.row_index > row_idx_start
                    ):
                        # More than one row serves as column header
                        table_list[0][cell.column_index + c] += f"\n{cell.content}"
                        additional_column_header_rows.add(cell.row_index - row_idx_start)
                    else:
                        table_list[cell.row_index + r - row_idx_start][cell.column_index + c] = cell.content

        # Remove additional column header rows, as these got attached to the first row.
        # Deleting from the highest index down keeps the remaining indices valid.
        for row_idx in sorted(additional_column_header_rows, reverse=True):
            del table_list[row_idx]

        # Get preceding context of table
        if table.bounding_regions:
            table_beginning_page = next(
                page for page in result.pages if page.page_number == table.bounding_regions[0].page_number
            )
        else:
            table_beginning_page = None
        table_start_offset = table.spans[0].offset
        if table_beginning_page and table_beginning_page.lines:
            preceding_lines = [
                line.content for line in table_beginning_page.lines if line.spans[0].offset < table_start_offset
            ]
        else:
            preceding_lines = []
        # `lines[-0:]` is the whole list, so a context length of 0 must be handled explicitly.
        preceding_len = self.preceding_context_len
        preceding_tail = preceding_lines[-preceding_len:] if preceding_len > 0 else []
        preceding_context = ("\n".join(preceding_tail) + f"\n{caption}").strip()

        # Get following context
        if table.bounding_regions and len(table.bounding_regions) == 1:
            table_end_page = table_beginning_page
        elif table.bounding_regions:
            table_end_page = next(
                page for page in result.pages if page.page_number == table.bounding_regions[-1].page_number
            )
        else:
            table_end_page = None

        table_end_offset = table_start_offset + table.spans[0].length
        if table_end_page and table_end_page.lines:
            following_lines = [
                line.content for line in table_end_page.lines if line.spans[0].offset > table_end_offset
            ]
        else:
            following_lines = []
        # Same guard as for the preceding context, so both sides treat non-positive lengths alike.
        following_len = self.following_context_len
        following_head = following_lines[:following_len] if following_len > 0 else []
        following_context = "\n".join(following_head)

        # Work on a copy so the caller's metadata is not shared between documents.
        table_meta = copy.deepcopy(meta)

        if isinstance(table_meta, dict):
            table_meta["preceding_context"] = preceding_context
            table_meta["following_context"] = following_context
        else:
            table_meta = {"preceding_context": preceding_context, "following_context": following_context}

        if table.bounding_regions:
            table_meta["page"] = table.bounding_regions[0].page_number

        # Convert table to CSV
        table_df = DataFrame(data=table_list)
        table_content = table_df.to_csv(header=False, index=False, lineterminator="\n")
        converted_tables.append(Document(content=table_content, meta=table_meta))

    return converted_tables
|
|
312
|
+
|
|
313
|
+
def _convert_to_natural_text(self, result: AnalyzeResult, meta: dict[str, Any] | None) -> Document:
    """
    Convert the `AnalyzeResult` object into a single document that follows the natural reading order.

    Paragraphs are grouped by page and the pages are joined with form feed characters, which is the format
    expected by the PreProcessor. Paragraphs that are part of a table are left out. Pages without any remaining
    text are represented by empty strings so that page numbers can still be derived from the separators.

    :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result
        can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult).
    :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
        Can be any custom keys and values.
    :returns: A single Document containing all the text extracted from the AnalyzeResult object. The content is
        empty if no paragraphs were detected or if all of them belong to tables.
    """
    table_spans_by_page = self._collect_table_spans(result=result)

    texts = []
    if result.paragraphs:
        paragraphs_to_pages: dict[int, str] = defaultdict(str)
        for paragraph in result.paragraphs:
            if paragraph.bounding_regions:
                # If paragraph spans multiple pages we group it with the first page number
                page_number = paragraph.bounding_regions[0].page_number
            else:
                # If page_number is not available we put the paragraph onto the highest page seen so far
                page_number = max(paragraphs_to_pages, default=1)
            # Check if paragraph is part of a table and if so skip
            if self._check_if_in_table(table_spans_by_page[page_number], line_or_paragraph=paragraph):
                continue
            paragraphs_to_pages[page_number] += paragraph.content + "\n"

        # `default=0` keeps this from raising a ValueError when every paragraph was skipped as table content
        max_page_number: int = max(paragraphs_to_pages, default=0)
        for page_idx in range(1, max_page_number + 1):
            # We add empty strings for missing pages so the preprocessor can still extract the correct page number
            # from the original PDF.
            texts.append(paragraphs_to_pages.get(page_idx, ""))
    else:
        logger.warning("No text paragraphs were detected by the OCR conversion.")

    all_text = "\f".join(texts)
    return Document(content=all_text, meta=meta if meta else {})
|
|
356
|
+
|
|
357
|
+
def _convert_to_single_column_text(
    self, result: AnalyzeResult, meta: dict[str, str] | None, threshold_y: float = 0.05
) -> Document:
    """
    Convert the `AnalyzeResult` object into a single Haystack Document with a single column layout.

    Lines whose upper left corners are vertically closer than `threshold_y` are merged into one row. Within a row
    the lines are ordered from left to right, and the rows of a page are ordered from top to bottom. Pages on which
    at least one line has no polygon cannot be re-ordered: their lines are emitted one per row in the order in
    which they appear in `page.lines`. Rows containing a line that is part of a table are left out. The pages are
    joined with form feed characters, which is the format expected by the PreProcessor.

    :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result
        can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult).
    :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
        Can be any custom keys and values.
    :param threshold_y: height threshold in inches for PDF and pixels for images
    :returns: A single Document containing all the text extracted from the AnalyzeResult object.
    """
    table_spans_by_page = self._collect_table_spans(result=result)

    # Find all pairs of lines that should be grouped together based on the y-value of the upper left coordinate
    # of their bounding box
    pairs_by_page = defaultdict(list)
    # Indices of the pages whose lines all carry a polygon; only these pages can be sorted geometrically
    pages_with_polygons = set()
    for page_idx, page in enumerate(result.pages):
        lines = page.lines if page.lines else []
        # Only works if polygons is available
        if all(line.polygon is not None for line in lines):
            pages_with_polygons.add(page_idx)
            for i, line_i in enumerate(lines):
                # left_upi, right_upi, right_lowi, left_lowi = line_i.polygon
                left_upi, _, _, _ = line_i.polygon
                # The self-pair makes sure that lines without a neighbour still end up in the graph
                pairs_by_page[page_idx].append([i, i])
                for j in range(i + 1, len(lines)):
                    left_upj, _, _, _ = lines[j].polygon
                    close_on_y_axis = abs(left_upi[1] - left_upj[1]) < threshold_y
                    if close_on_y_axis:
                        pairs_by_page[page_idx].append([i, j])
        # Default if polygon is not available
        else:
            logger.info(
                "Polygon information for lines on page {page_idx} is not available so it is not possible "
                "to enforce a single column page layout.",
                page_idx=page_idx,
            )
            for i in range(len(lines)):
                pairs_by_page[page_idx].append([i, i])

    # Merge the line pairs that are connected by page
    merged_pairs_by_page = {}
    for page_idx, pairs in pairs_by_page.items():
        graph = nx.Graph()
        graph.add_edges_from(pairs)
        merged_pairs_by_page[page_idx] = [list(a) for a in list(nx.connected_components(graph))]

    # Construct the text to write
    texts = []
    for page_idx, page in enumerate(result.pages):
        lines = page.lines if page.lines else []
        # Convert line indices to the DocumentLine objects.
        # We use .get(page_idx, []) since the page could be empty
        rows = [[lines[line_idx] for line_idx in row] for row in merged_pairs_by_page.get(page_idx, [])]

        # Sorting reads `polygon`, so it must be skipped for pages where polygons are missing. Those pages only
        # contain single-line rows, which are kept in their original order.
        if page_idx in pages_with_polygons:
            # Sort the lines in each row by the x-value of the upper left bounding box coordinate
            rows = [sorted(row, key=lambda x: x.polygon[0][0]) for row in rows]
            # Sort the rows within the page by the y-value of the upper left bounding box coordinate
            rows = sorted(rows, key=lambda x: x[0].polygon[0][1])

        tables_on_page = table_spans_by_page[page.page_number]
        page_text = ""
        for row_of_lines in rows:
            # Check if line is part of a table and if so skip
            if any(self._check_if_in_table(tables_on_page, line_or_paragraph=line) for line in row_of_lines):
                continue
            page_text += " ".join(line.content for line in row_of_lines)
            page_text += "\n"
        texts.append(page_text)
    all_text = "\f".join(texts)
    return Document(content=all_text, meta=meta if meta else {})
|
|
447
|
+
|
|
448
|
+
def _collect_table_spans(self, result: AnalyzeResult) -> dict:
    """
    Group the first span of every located table under the page on which the table starts.

    Tables without bounding regions cannot be assigned to a page and are left out.

    :param result: The AnalyzeResult object returned by the `begin_analyze_document` method.
    :returns: A `defaultdict(list)` that maps a page number to the list of table spans starting on that page.
        Looking up a page without tables yields an empty list.
    """
    spans_per_page = defaultdict(list)
    for tbl in result.tables or []:
        regions = tbl.bounding_regions
        if regions:
            # Only the first region and the first span are used to register the table
            first_page = regions[0].page_number
            spans_per_page[first_page].append(tbl.spans[0])
    return spans_per_page
|
|
462
|
+
|
|
463
|
+
def _check_if_in_table(self, tables_on_page: list, line_or_paragraph: DocumentLine | DocumentParagraph) -> bool:
    """
    Check if a line or paragraph is part of a table.

    :param tables_on_page: The spans of the tables on the page of the line or paragraph. Every span must provide
        the attributes `offset` and `length`.
    :param line_or_paragraph: The line or paragraph to check. Only its first span is considered.
    :returns: True if the first span of the line or paragraph starts within one of the table spans, False
        otherwise. Also False if the line or paragraph has no spans.
    """
    if not line_or_paragraph.spans:
        # Without a span the element cannot be located, so it cannot be attributed to a table
        return False
    start_offset = line_or_paragraph.spans[0].offset
    # The end of a table span (offset + length) is treated as inclusive
    return any(table.offset <= start_offset <= table.offset + table.length for table in tables_on_page)
|
|
File without changes
|