kreuzberg 3.15.0__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +11 -1
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pdf.py +2 -3
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +0 -21
- kreuzberg/_ocr/_easyocr.py +51 -19
- kreuzberg/_ocr/_tesseract.py +14 -3
- kreuzberg/_types.py +111 -40
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/extraction.py +2 -2
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +12 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/RECORD +24 -23
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -8,8 +8,10 @@ from ._types import (
     ExtractionConfig,
     ExtractionResult,
     GMFTConfig,
+    HTMLToMarkdownConfig,
     ImageOCRConfig,
     ImageOCRResult,
+    JSONExtractionConfig,
     LanguageDetectionConfig,
     Metadata,
     PaddleOCRConfig,
@@ -40,8 +42,10 @@ __all__ = [
     "ExtractionResult",
     "ExtractorRegistry",
     "GMFTConfig",
+    "HTMLToMarkdownConfig",
     "ImageOCRConfig",
     "ImageOCRResult",
+    "JSONExtractionConfig",
     "KreuzbergError",
     "LanguageDetectionConfig",
     "Metadata",
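Both names come from `kreuzberg._types`: `JSONExtractionConfig` appears to be introduced in this release (the `_types.py` diff is +111/−40, not shown in full), while `HTMLToMarkdownConfig` was previously importable only from `_types`. A minimal sketch of how they plug into an extraction call; `json_config` matches the attribute read in `_extractors/_structured.py` below, whereas `html_to_markdown_config` is an assumed `ExtractionConfig` field name not confirmed by this diff:

```python
from kreuzberg import ExtractionConfig, HTMLToMarkdownConfig, JSONExtractionConfig

# json_config is consumed by StructuredDataExtractor (see _structured.py below);
# html_to_markdown_config is an assumed field name, not shown in this diff.
config = ExtractionConfig(
    json_config=JSONExtractionConfig(extract_schema=True),
    html_to_markdown_config=HTMLToMarkdownConfig(),
)
```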
kreuzberg/_api/main.py
CHANGED
@@ -13,10 +13,8 @@ from typing_extensions import TypedDict
 
 from kreuzberg import (
     EasyOCRConfig,
-    ExtractedImage,
     ExtractionConfig,
     ExtractionResult,
-    ImageOCRResult,
     KreuzbergError,
     MissingDependencyError,
     PaddleOCRConfig,
@@ -40,30 +38,6 @@ if TYPE_CHECKING:
     from litestar.datastructures import UploadFile
 
 
-class ExtractedImageDict(TypedDict):
-    """TypedDict for extracted image JSON representation."""
-
-    data: str
-    format: str
-    filename: str | None
-    page_number: int | None
-    dimensions: tuple[int, int] | None
-    colorspace: str | None
-    bits_per_component: int | None
-    is_mask: bool
-    description: str | None
-
-
-class ImageOCRResultDict(TypedDict):
-    """TypedDict for image OCR result JSON representation."""
-
-    image: ExtractedImageDict
-    ocr_result: Any
-    confidence_score: float | None
-    processing_time: float | None
-    skipped_reason: str | None
-
-
 class HealthResponse(TypedDict):
     """Response model for health check endpoint."""
 
@@ -384,31 +358,6 @@ def _pil_image_encoder(obj: Any) -> str:
     return f"data:image/png;base64,{img_str}"
 
 
-def _extracted_image_encoder(obj: ExtractedImage) -> ExtractedImageDict:
-    encoded_data = base64.b64encode(obj.data).decode()
-    return ExtractedImageDict(
-        data=f"data:image/{obj.format};base64,{encoded_data}",
-        format=obj.format,
-        filename=obj.filename,
-        page_number=obj.page_number,
-        dimensions=obj.dimensions,
-        colorspace=obj.colorspace,
-        bits_per_component=obj.bits_per_component,
-        is_mask=obj.is_mask,
-        description=obj.description,
-    )
-
-
-def _image_ocr_result_encoder(obj: ImageOCRResult) -> ImageOCRResultDict:
-    return ImageOCRResultDict(
-        image=_extracted_image_encoder(obj.image),
-        ocr_result=obj.ocr_result,
-        confidence_score=obj.confidence_score,
-        processing_time=obj.processing_time,
-        skipped_reason=obj.skipped_reason,
-    )
-
-
 openapi_config = OpenAPIConfig(
     title="Kreuzberg API",
     version="3.14.0",
@@ -428,8 +377,6 @@ openapi_config = OpenAPIConfig(
 type_encoders = {
     pl.DataFrame: _polars_dataframe_encoder,
     Image.Image: _pil_image_encoder,
-    ExtractedImage: _extracted_image_encoder,
-    ImageOCRResult: _image_ocr_result_encoder,
 }
 
 app = Litestar(
kreuzberg/_config.py
CHANGED
@@ -69,7 +69,17 @@ def _build_ocr_config_from_cli(
     try:
         match ocr_backend:
             case "tesseract":
-                return TesseractConfig(**backend_args)
+                # Handle PSM mode conversion from int to enum
+                processed_args = backend_args.copy()
+                if "psm" in processed_args and isinstance(processed_args["psm"], int):
+                    try:
+                        processed_args["psm"] = PSMMode(processed_args["psm"])
+                    except ValueError as e:
+                        raise ValidationError(
+                            f"Invalid PSM mode value: {processed_args['psm']}",
+                            context={"psm_value": processed_args["psm"], "error": str(e)},
+                        ) from e
+                return TesseractConfig(**processed_args)
             case "easyocr":
                 return EasyOCRConfig(**backend_args)
             case "paddleocr":
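The new branch coerces a raw integer `psm` from the CLI into the `PSMMode` enum before building the config. The same coercion in isolation (the value 6 is illustrative, and `PSMMode` is assumed to be importable from the package root):

```python
from kreuzberg import PSMMode, TesseractConfig

backend_args = {"psm": 6}  # CLI/TOML values arrive as plain ints
backend_args["psm"] = PSMMode(backend_args["psm"])  # ValueError for unknown modes
config = TesseractConfig(**backend_args)
```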
kreuzberg/_document_classification.py
CHANGED
@@ -132,7 +132,7 @@ def classify_document_from_layout(
         if not found_words.is_empty():
             scores[doc_type] += 1.0
             word_top = found_words[0, "top"]
-            if word_top < page_height * 0.3:
+            if word_top is not None and word_top < page_height * 0.3:
                 scores[doc_type] += 0.5
 
     total_score = sum(scores.values())
kreuzberg/_extractors/_email.py
CHANGED
@@ -27,6 +27,8 @@ except ImportError:  # pragma: no cover
     html2text = None
 
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
+_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")
 
 
 class EmailExtractor(Extractor):
@@ -86,7 +88,14 @@ class EmailExtractor(Extractor):
     def _format_email_field(self, field: Any) -> str:
         match field:
             case list():
-                return ", ".join(str(item) for item in field)
+                emails = []
+                for item in field:
+                    if isinstance(item, dict):
+                        if email := item.get("email", ""):
+                            emails.append(str(email))
+                    else:
+                        emails.append(str(item))
+                return ", ".join(emails)
             case dict():
                 return str(field.get("email", ""))
             case _:
@@ -111,12 +120,8 @@ class EmailExtractor(Extractor):
         cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
         clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
         clean_html = unescape(clean_html)
-        clean_html = (
-            clean_html.replace("\u201c", '"')
-            .replace("\u201d", '"')
-            .replace("\u2019", "'")
-            .replace("\u2018", "'")
-        )
+        clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
+        clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
         text_parts.append(clean_html)
 
     def _extract_email_attachments(
@@ -129,12 +134,12 @@ class EmailExtractor(Extractor):
         for att in attachments:
             name_val: str = "unknown"
             if isinstance(att, dict):
-                n = att.get("name")
+                n = att.get("name") or att.get("filename")
                 if isinstance(n, str) and n:
                     name_val = n
             names.append(name_val)
-        metadata["attachments"] = names
         if names:
+            metadata["attachments"] = names
             text_parts.append("Attachments: " + ", ".join(names))
 
     def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
@@ -151,7 +156,8 @@ class EmailExtractor(Extractor):
             if not isinstance(mime, str) or not mime.startswith("image/"):
                 continue
 
-            name = att.get("name")
+            name = att.get("name") or att.get("filename")
+            name = name if isinstance(name, str) else None
             data = att.get("data") or att.get("content") or att.get("payload")
             raw: bytes | None = None
             if isinstance(data, (bytes, bytearray)):
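The two precompiled patterns replace a chain of `str.replace` calls with one regex pass per quote class. A quick self-contained check of the behaviour:

```python
import re

_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")

text = "\u201cquoted\u201d and \u2018quoted\u2019"
text = _UNICODE_QUOTES_PATTERN.sub('"', text)   # curly double quotes -> "
text = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", text)  # curly single quotes -> '
assert text == "\"quoted\" and 'quoted'"
```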
kreuzberg/_extractors/_html.py
CHANGED
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
 import base64
+import binascii
+import io
 import logging
 from typing import TYPE_CHECKING, ClassVar
 
 import html_to_markdown
 from anyio import Path as AsyncPath
 from bs4 import BeautifulSoup
+from PIL import Image
 
 from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
 from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_async, run_sync
 
@@ -44,6 +48,11 @@ class HTMLExtractor(Extractor):
         config_dict = config.to_dict()
 
         html_content = safe_decode(content)
+
+        use_streaming, chunk_size = should_use_streaming(len(content))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
 
         extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
@@ -89,6 +98,13 @@ class HTMLExtractor(Extractor):
                     )
                     continue
 
+                dimensions = None
+                try:
+                    with Image.open(io.BytesIO(image_data)) as pil_img:
+                        dimensions = pil_img.size
+                except (OSError, ValueError) as e:
+                    logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
+
                 alt_val = img.get("alt")  # type: ignore[union-attr]
                 desc = alt_val if isinstance(alt_val, str) else None
                 images.append(
@@ -97,25 +113,36 @@ class HTMLExtractor(Extractor):
                         format=format_name,
                         filename=f"embedded_image_{len(images) + 1}.{format_name}",
                         description=desc,
+                        dimensions=dimensions,
                     )
                 )
-            except Exception as e:
+            except (ValueError, binascii.Error) as e:
                 logger.warning("Failed to extract base64 image: %s", e)
 
-        for svg in soup.find_all("svg"):
+        def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
             try:
-                svg_content = str(svg).encode("utf-8")
-                title_or_aria = svg.get("title") or svg.get("aria-label")
+                svg_content = str(svg_element).encode("utf-8")
+
+                def _get_attr_safe(obj: object, attr: str) -> str | None:
+                    get_method = getattr(obj, "get", None)
+                    if callable(get_method):
+                        result = get_method(attr)
+                        return result if isinstance(result, str) else None
+                    return None
+
+                title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
                 desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
-                images.append(
-                    ExtractedImage(
-                        data=svg_content,
-                        format="svg",
-                        filename=f"inline_svg_{len(images) + 1}.svg",
-                        description=desc_svg,
-                    )
+                return ExtractedImage(
+                    data=svg_content,
+                    format="svg",
+                    filename=f"inline_svg_{len(images) + 1}.svg",
+                    description=desc_svg,
                 )
-            except Exception as e:
+            except (UnicodeEncodeError, AttributeError) as e:
                 logger.warning("Failed to extract SVG: %s", e)
+                return None
+
+        svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
+        images.extend(img for img in svg_images if img is not None)
 
         return images
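`should_use_streaming` comes from the new `kreuzberg/_utils/_html_streaming.py` (+20 lines, not shown in full). The call site only reveals its shape: it takes the input size in bytes and returns a `(use_streaming, chunk_size)` tuple that is forwarded to `html_to_markdown`. A hedged sketch under that assumption:

```python
from kreuzberg._utils._html_streaming import should_use_streaming

content = b"<html><body>" + b"<p>x</p>" * 100_000 + b"</body></html>"

# Assumed behaviour: small inputs return (False, ...), large ones (True, chunk_size).
use_streaming, chunk_size = should_use_streaming(len(content))
config_dict = {"stream_processing": use_streaming, "chunk_size": chunk_size}
```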
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import asyncio
 import contextlib
 import io
 import logging
@@ -41,7 +40,7 @@ from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
 from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
 from kreuzberg._utils._string import normalize_spaces
-from kreuzberg._utils._sync import run_maybe_async, run_taskgroup_batched
+from kreuzberg._utils._sync import run_maybe_async, run_taskgroup, run_taskgroup_batched
 from kreuzberg._utils._table import generate_table_summary
 from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError
@@ -231,7 +230,7 @@ class PDFExtractor(Extractor):
                     img_counter += 1
 
         if tasks:
-            results = await asyncio.gather(*tasks)
+            results = await run_taskgroup(*tasks)
            return [img for img in results if img is not None]
 
        return []
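`run_taskgroup` replaces the dropped `asyncio.gather` call. From the call site it appears to be gather-like: it awaits the supplied coroutines and returns their results as a list. A sketch under that assumption:

```python
import asyncio

from kreuzberg._utils._sync import run_taskgroup


async def demo() -> None:
    async def work(i: int) -> int:
        await asyncio.sleep(0)
        return i * 2

    # Assumed gather-like semantics; result ordering is not confirmed by the diff.
    results = await run_taskgroup(work(1), work(2), work(3))
    print(results)


asyncio.run(demo())
```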
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -142,6 +142,8 @@ class PresentationExtractor(Extractor):
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                     try:
                         image = shape.image
+                        if not image.blob or not isinstance(image.blob, bytes):
+                            continue
                         filename = f"slide_{slide_num}_image_{len(images) + 1}.{image.ext}"
 
                         images.append(
@@ -162,6 +164,8 @@ class PresentationExtractor(Extractor):
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                     try:
                         image = shape.image
+                        if not image.blob or not isinstance(image.blob, bytes):
+                            continue
                         filename = f"slide_{slide_num}_group_image_{image_count + len(images) + 1}.{image.ext}"
                         images.append(
                             ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -197,7 +197,6 @@ class SpreadSheetExtractor(Extractor):
         if not data or not any(row for row in data):
             return f"## {sheet_name}\n\n*Empty sheet*"
 
-        # Normalize row lengths to avoid polars ShapeError
         if data:
             max_cols = max(len(row) if row else 0 for row in data)
             data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data]  # type: ignore[list-item]
kreuzberg/_extractors/_structured.py
CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import json
 import sys
 from typing import TYPE_CHECKING, Any, ClassVar
 
@@ -17,11 +16,13 @@ try:
 except ImportError:  # pragma: no cover
     yaml = None
 
+
 from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._types import ExtractionResult, JSONExtractionConfig, normalize_metadata
+from kreuzberg._utils._serialization import deserialize
 from kreuzberg._utils._string import normalize_spaces, safe_decode
 from kreuzberg._utils._sync import run_sync
 
@@ -43,6 +44,42 @@ class StructuredDataExtractor(Extractor):
         "text/toml",
     }
 
+    @property
+    def _json_config(self) -> JSONExtractionConfig | None:
+        return self.config.json_config
+
+    def _get_text_field_keywords(self) -> frozenset[str]:
+        json_config = self._json_config
+        if json_config and json_config.custom_text_field_patterns:
+            return _TEXT_FIELD_KEYWORDS | json_config.custom_text_field_patterns
+        return _TEXT_FIELD_KEYWORDS
+
+    def _extract_json_schema(self, data: Any, path: str = "", depth: int = 0) -> dict[str, Any]:
+        json_config = self._json_config
+        if not json_config or not json_config.extract_schema:
+            return {}
+
+        if depth >= json_config.max_depth:
+            return {"max_depth_reached": True}
+
+        schema_info: dict[str, Any] = {"type": type(data).__name__}
+
+        if isinstance(data, dict):
+            schema_info["properties"] = {}
+            for key, value in data.items():
+                key_path = f"{path}.{key}" if path else key
+                schema_info["properties"][key] = self._extract_json_schema(value, key_path, depth + 1)
+        elif isinstance(data, list) and data:
+            if len(data) <= json_config.array_item_limit:
+                schema_info["items"] = self._extract_json_schema(data[0], f"{path}[0]", depth + 1)
+                schema_info["length"] = len(data)
+            else:
+                schema_info["items"] = {"type": "truncated"}
+                schema_info["length"] = len(data)
+                schema_info["truncated"] = True
+
+        return schema_info
+
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         return await run_sync(self.extract_bytes_sync, content)
 
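When `extract_schema` is enabled, `_extract_json_schema` records a type outline of the parsed JSON that later lands in `metadata["json_schema"]`. Tracing the logic above on a small input (assuming the default `max_depth` and `array_item_limit` are not hit):

```python
data = {"title": "Report", "tags": ["a", "b"], "meta": {"pages": 3}}

# Expected result of _extract_json_schema(data) per the code above:
schema = {
    "type": "dict",
    "properties": {
        "title": {"type": "str"},
        "tags": {"type": "list", "items": {"type": "str"}, "length": 2},
        "meta": {"type": "dict", "properties": {"pages": {"type": "int"}}},
    },
}
```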
@@ -51,12 +88,12 @@ class StructuredDataExtractor(Extractor):
         return await self.extract_bytes_async(content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        text_content = safe_decode(content)
-
+        text_content: None | str = None
         try:
             if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
-                data = json.loads(text_content)
+                data = deserialize(content, dict, json=True)
             elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                text_content = safe_decode(content)
                 if tomllib is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -66,6 +103,7 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = tomllib.loads(text_content)
             else:
+                text_content = safe_decode(content)
                 if yaml is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -75,9 +113,17 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = yaml.safe_load(text_content)
 
-            text_parts: list[str] = []
             metadata: dict[str, Any] = {}
 
+            if (
+                self.mime_type in {JSON_MIME_TYPE, "text/json"}
+                and self._json_config
+                and self._json_config.extract_schema
+            ):
+                schema_info = self._extract_json_schema(data)
+                if schema_info:
+                    metadata["json_schema"] = schema_info
+
             if isinstance(data, dict):
                 text_parts = self._extract_from_dict(data, metadata)
             elif isinstance(data, list):
@@ -85,7 +131,7 @@ class StructuredDataExtractor(Extractor):
             else:
                 text_parts = [str(data)]
 
-            combined_text = "\n".join(text_parts) if text_parts else text_content
+            combined_text = "\n".join(text_parts) if text_parts else (text_content or safe_decode(content))
 
             return ExtractionResult(
                 content=normalize_spaces(combined_text),
@@ -96,7 +142,7 @@ class StructuredDataExtractor(Extractor):
 
         except (ValueError, TypeError) as e:
             return ExtractionResult(
-                content=normalize_spaces(text_content),
+                content=normalize_spaces(text_content or safe_decode(content)),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata={"parse_error": str(e)},
                 chunks=[],
@@ -113,23 +159,38 @@ class StructuredDataExtractor(Extractor):
             full_key = f"{prefix}.{key}" if prefix else key
 
             if isinstance(value, str) and value.strip():
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{full_key} (string): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")
 
                 key_lower = key.lower()
-                if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
+                text_field_keywords = self._get_text_field_keywords()
+                if any(keyword in key_lower for keyword in text_field_keywords):
                     metadata[full_key] = value
 
             elif isinstance(value, (int, float, bool)):
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")
 
             elif isinstance(value, dict):
-                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+                if self._json_config and not self._json_config.flatten_nested_objects:
+                    text_parts.append(f"{full_key}: [nested object with {len(value)} properties]")
+                else:
+                    text_parts.extend(self._extract_from_dict(value, metadata, full_key))
 
             elif isinstance(value, list):
                 text_parts.extend(self._extract_from_list(value, metadata, full_key))
 
             elif value is not None:
-                text_parts.append(f"{full_key}: {value!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value!s}")
+                else:
+                    text_parts.append(f"{full_key}: {value!s}")
 
         return text_parts
 
@@ -140,7 +201,10 @@ class StructuredDataExtractor(Extractor):
             item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
 
             if isinstance(item, str) and item.strip():
-                text_parts.append(f"{item_key}: {item}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{item_key} (string): {item}")
+                else:
+                    text_parts.append(f"{item_key}: {item}")
 
             elif isinstance(item, dict):
                 text_parts.extend(self._extract_from_dict(item, metadata, item_key))
@@ -149,6 +213,10 @@ class StructuredDataExtractor(Extractor):
                 text_parts.extend(self._extract_from_list(item, metadata, item_key))
 
             elif item is not None:
-                text_parts.append(f"{item_key}: {item!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(item).__name__
+                    text_parts.append(f"{item_key} ({type_name}): {item!s}")
+                else:
+                    text_parts.append(f"{item_key}: {item!s}")
 
         return text_parts
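Taken together, the `include_type_info` and `flatten_nested_objects` options change how `_extract_from_dict` flattens a document into text lines. Sketched for a small input:

```python
doc = {"name": "Ada", "age": 36, "address": {"city": "London"}}

# include_type_info=True, flatten_nested_objects=True (per the branches above):
#   name (string): Ada
#   age (int): 36
#   address.city (string): London
#
# With flatten_nested_objects=False the last line becomes:
#   address: [nested object with 1 properties]
```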
kreuzberg/_gmft.py
CHANGED
@@ -312,6 +312,11 @@ def _extract_tables_in_process(
    from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
    from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
+    if "cell_required_confidence" in config_dict:
+        cell_config = config_dict["cell_required_confidence"]
+        if isinstance(cell_config, dict) and cell_config:
+            config_dict["cell_required_confidence"] = {int(k): v for k, v in cell_config.items()}
+
     config = GMFTConfig(**config_dict)
 
     formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
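GMFT's `cell_required_confidence` appears to be keyed by integer confidence classes, but `config_dict` crosses a process boundary in serialized form, where integer dict keys become strings; the new block restores them. In isolation:

```python
# After JSON-style serialization across the process boundary, int keys arrive as str.
cell_config = {"0": 0.3, "1": 0.3, "2": 0.3}
restored = {int(k): v for k, v in cell_config.items()}
assert restored == {0: 0.3, 1: 0.3, 2: 0.3}
```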