kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,12 +1,3 @@
|
|
1
|
-
"""This module provides functions to extract textual content from files.
|
2
|
-
|
3
|
-
It includes vendored code:
|
4
|
-
|
5
|
-
- The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
|
6
|
-
See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
|
7
|
-
Refer to the markitdown repository for it's license (MIT).
|
8
|
-
"""
|
9
|
-
|
10
1
|
from __future__ import annotations
|
11
2
|
|
12
3
|
import re
|
@@ -30,99 +21,27 @@ if TYPE_CHECKING: # pragma: no cover
|
|
30
21
|
|
31
22
|
from kreuzberg._types import Metadata
|
32
23
|
|
33
|
-
# Pre-compiled regex patterns for performance
|
34
24
|
_NON_WORD_PATTERN = re.compile(r"\W")
|
35
25
|
|
36
26
|
|
37
27
|
class PresentationExtractor(Extractor):
|
38
|
-
"""Extractor for PowerPoint (.pptx) files.
|
39
|
-
|
40
|
-
This extractor processes PowerPoint presentations and converts their content into Markdown format.
|
41
|
-
It handles slides, shapes, images, tables, and slide notes, preserving the structure and content
|
42
|
-
of the presentation in a readable text format.
|
43
|
-
|
44
|
-
The extractor provides both synchronous and asynchronous methods for processing files either
|
45
|
-
from disk or from bytes in memory.
|
46
|
-
"""
|
47
|
-
|
48
28
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}
|
49
29
|
|
50
30
|
async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
|
51
|
-
"""Asynchronously extract content from PowerPoint file bytes.
|
52
|
-
|
53
|
-
Args:
|
54
|
-
content: Raw bytes of the PowerPoint file to process.
|
55
|
-
|
56
|
-
Returns:
|
57
|
-
ExtractionResult: Contains the extracted content in Markdown format,
|
58
|
-
the MIME type, and any additional metadata.
|
59
|
-
"""
|
60
31
|
return self._extract_pptx(content)
|
61
32
|
|
62
33
|
async def extract_path_async(self, path: Path) -> ExtractionResult:
|
63
|
-
"""Asynchronously extract content from a PowerPoint file on disk.
|
64
|
-
|
65
|
-
Args:
|
66
|
-
path: Path to the PowerPoint file to process.
|
67
|
-
|
68
|
-
Returns:
|
69
|
-
ExtractionResult: Contains the extracted content in Markdown format,
|
70
|
-
the MIME type, and any additional metadata.
|
71
|
-
"""
|
72
34
|
content = await AsyncPath(path).read_bytes()
|
73
35
|
return self._extract_pptx(content)
|
74
36
|
|
75
37
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
76
|
-
"""Synchronously extract content from PowerPoint file bytes.
|
77
|
-
|
78
|
-
Args:
|
79
|
-
content: Raw bytes of the PowerPoint file to process.
|
80
|
-
|
81
|
-
Returns:
|
82
|
-
ExtractionResult: Contains the extracted content in Markdown format,
|
83
|
-
the MIME type, and any additional metadata.
|
84
|
-
"""
|
85
38
|
return self._extract_pptx(content)
|
86
39
|
|
87
40
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
88
|
-
"""Synchronously extract content from a PowerPoint file on disk.
|
89
|
-
|
90
|
-
Args:
|
91
|
-
path: Path to the PowerPoint file to process.
|
92
|
-
|
93
|
-
Returns:
|
94
|
-
ExtractionResult: Contains the extracted content in Markdown format,
|
95
|
-
the MIME type, and any additional metadata.
|
96
|
-
"""
|
97
41
|
content = Path(path).read_bytes()
|
98
42
|
return self._extract_pptx(content)
|
99
43
|
|
100
44
|
def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
|
101
|
-
"""Process PowerPoint file contents and convert to Markdown.
|
102
|
-
|
103
|
-
This method handles the core logic of extracting content from a PowerPoint file.
|
104
|
-
It processes:
|
105
|
-
- Slide titles and content
|
106
|
-
- Images (with alt text if available)
|
107
|
-
- Tables (converted to HTML format)
|
108
|
-
- Text frames
|
109
|
-
- Slide notes
|
110
|
-
|
111
|
-
Args:
|
112
|
-
file_contents: Raw bytes of the PowerPoint file to process.
|
113
|
-
|
114
|
-
Returns:
|
115
|
-
ExtractionResult: Contains the extracted content in Markdown format,
|
116
|
-
the MIME type, and any additional metadata.
|
117
|
-
|
118
|
-
Notes:
|
119
|
-
The extraction preserves the following elements:
|
120
|
-
- Slide numbers (as HTML comments)
|
121
|
-
- Images (converted to Markdown image syntax with alt text)
|
122
|
-
- Tables (converted to HTML table syntax)
|
123
|
-
- Text content (with titles properly formatted)
|
124
|
-
- Slide notes (under a dedicated section for each slide)
|
125
|
-
"""
|
126
45
|
md_content = ""
|
127
46
|
presentation = pptx.Presentation(BytesIO(file_contents))
|
128
47
|
|
@@ -191,33 +110,20 @@ class PresentationExtractor(Extractor):
|
|
191
110
|
|
192
111
|
@staticmethod
|
193
112
|
def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
|
194
|
-
"""Extract metadata from a presentation instance.
|
195
|
-
|
196
|
-
Args:
|
197
|
-
presentation: A `Presentation` object representing the PowerPoint file.
|
198
|
-
|
199
|
-
Returns:
|
200
|
-
PresentationMetadata: Object containing presentation-specific metadata fields.
|
201
|
-
"""
|
202
113
|
metadata: Metadata = {}
|
203
114
|
|
204
|
-
# Extract core properties
|
205
115
|
PresentationExtractor._extract_core_properties(presentation, metadata)
|
206
116
|
|
207
|
-
# Extract fonts used in presentation
|
208
117
|
fonts = PresentationExtractor._extract_fonts(presentation)
|
209
118
|
if fonts:
|
210
119
|
metadata["fonts"] = list(fonts)
|
211
120
|
|
212
|
-
# Add structural information
|
213
121
|
PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
|
214
122
|
|
215
123
|
return metadata
|
216
124
|
|
217
125
|
@staticmethod
|
218
126
|
def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
|
219
|
-
"""Extract core document properties from presentation."""
|
220
|
-
# Property mapping for core metadata
|
221
127
|
property_mapping = [
|
222
128
|
("authors", "author"),
|
223
129
|
("comments", "comments"),
|
@@ -236,7 +142,6 @@ class PresentationExtractor(Extractor):
|
|
236
142
|
if core_property := getattr(presentation.core_properties, core_property_key, None):
|
237
143
|
metadata[metadata_key] = core_property # type: ignore[literal-required]
|
238
144
|
|
239
|
-
# Handle special list properties
|
240
145
|
if presentation.core_properties.language:
|
241
146
|
metadata["languages"] = [presentation.core_properties.language]
|
242
147
|
|
@@ -245,7 +150,6 @@ class PresentationExtractor(Extractor):
|
|
245
150
|
|
246
151
|
@staticmethod
|
247
152
|
def _extract_fonts(presentation: Presentation) -> set[str]:
|
248
|
-
"""Extract all fonts used in the presentation."""
|
249
153
|
fonts = set()
|
250
154
|
for slide in presentation.slides:
|
251
155
|
for shape in slide.shapes:
|
@@ -260,12 +164,10 @@ class PresentationExtractor(Extractor):
|
|
260
164
|
|
261
165
|
@staticmethod
|
262
166
|
def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
|
263
|
-
"""Add structural information about the presentation."""
|
264
167
|
slide_count = len(presentation.slides)
|
265
168
|
if slide_count == 0:
|
266
169
|
return
|
267
170
|
|
268
|
-
# Build description
|
269
171
|
structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
|
270
172
|
|
271
173
|
slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
|
@@ -274,7 +176,6 @@ class PresentationExtractor(Extractor):
|
|
274
176
|
|
275
177
|
metadata["description"] = structure_info
|
276
178
|
|
277
|
-
# Build summary if not already present
|
278
179
|
if "summary" not in metadata:
|
279
180
|
summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
|
280
181
|
if slides_with_notes > 0:
|
@@ -10,15 +10,17 @@ from io import StringIO
|
|
10
10
|
from pathlib import Path
|
11
11
|
from typing import Any
|
12
12
|
|
13
|
+
import polars as pl
|
13
14
|
from anyio import Path as AsyncPath
|
14
15
|
from PIL import Image
|
15
16
|
from python_calamine import CalamineWorkbook
|
16
17
|
|
17
18
|
from kreuzberg._extractors._base import Extractor
|
18
19
|
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
|
19
|
-
from kreuzberg._types import ExtractionResult, Metadata
|
20
|
+
from kreuzberg._types import ExtractionResult, Metadata, TableData
|
20
21
|
from kreuzberg._utils._string import normalize_spaces
|
21
22
|
from kreuzberg._utils._sync import run_sync, run_taskgroup
|
23
|
+
from kreuzberg._utils._table import enhance_table_markdown
|
22
24
|
from kreuzberg._utils._tmp import create_temp_file
|
23
25
|
from kreuzberg.exceptions import ParsingError
|
24
26
|
|
@@ -70,7 +72,6 @@ class SpreadSheetExtractor(Extractor):
|
|
70
72
|
) from e
|
71
73
|
|
72
74
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
73
|
-
"""Pure sync implementation of extract_bytes."""
|
74
75
|
fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
|
75
76
|
|
76
77
|
try:
|
@@ -83,7 +84,6 @@ class SpreadSheetExtractor(Extractor):
|
|
83
84
|
Path(temp_path).unlink()
|
84
85
|
|
85
86
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
86
|
-
"""Pure sync implementation of extract_path."""
|
87
87
|
try:
|
88
88
|
workbook = CalamineWorkbook.from_path(str(path))
|
89
89
|
results = []
|
@@ -108,14 +108,6 @@ class SpreadSheetExtractor(Extractor):
|
|
108
108
|
|
109
109
|
@staticmethod
|
110
110
|
def _convert_cell_to_str(value: Any) -> str:
|
111
|
-
"""Convert a cell value to string representation.
|
112
|
-
|
113
|
-
Args:
|
114
|
-
value: The cell value to convert.
|
115
|
-
|
116
|
-
Returns:
|
117
|
-
String representation of the cell value.
|
118
|
-
"""
|
119
111
|
if value is None:
|
120
112
|
return ""
|
121
113
|
if isinstance(value, bool):
|
@@ -139,7 +131,7 @@ class SpreadSheetExtractor(Extractor):
|
|
139
131
|
csv_buffer.close()
|
140
132
|
|
141
133
|
csv_path, unlink = await create_temp_file(".csv")
|
142
|
-
await AsyncPath(csv_path).write_text(csv_data)
|
134
|
+
await AsyncPath(csv_path).write_text(csv_data, encoding="utf-8")
|
143
135
|
|
144
136
|
csv_reader = csv.reader(StringIO(csv_data))
|
145
137
|
rows = list(csv_reader)
|
@@ -162,7 +154,6 @@ class SpreadSheetExtractor(Extractor):
|
|
162
154
|
return f"## {sheet_name}\n\n{normalize_spaces(result)}"
|
163
155
|
|
164
156
|
def _convert_sheet_to_text_sync(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
|
165
|
-
"""Synchronous version of _convert_sheet_to_text."""
|
166
157
|
values = workbook.get_sheet_by_name(sheet_name).to_python()
|
167
158
|
|
168
159
|
csv_buffer = StringIO()
|
@@ -195,82 +186,57 @@ class SpreadSheetExtractor(Extractor):
|
|
195
186
|
return f"## {sheet_name}\n\n{normalize_spaces(result)}"
|
196
187
|
|
197
188
|
def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
|
198
|
-
"""Enhanced sheet processing with better table structure preservation."""
|
199
189
|
try:
|
200
|
-
# pandas is optional dependency
|
201
|
-
import pandas as pd # noqa: PLC0415
|
202
|
-
|
203
|
-
from kreuzberg._utils._table import enhance_table_markdown # noqa: PLC0415
|
204
|
-
|
205
190
|
sheet = workbook.get_sheet_by_name(sheet_name)
|
206
191
|
data = sheet.to_python()
|
207
192
|
|
208
193
|
if not data or not any(row for row in data):
|
209
194
|
return f"## {sheet_name}\n\n*Empty sheet*"
|
210
195
|
|
211
|
-
|
212
|
-
df = pd.DataFrame(data)
|
196
|
+
df = pl.DataFrame(data)
|
213
197
|
|
214
|
-
|
215
|
-
df = df.
|
198
|
+
df = df.filter(~pl.all_horizontal(pl.all().is_null()))
|
199
|
+
df = df.select([col for col in df.columns if not df[col].is_null().all()])
|
216
200
|
|
217
|
-
if df.
|
201
|
+
if df.is_empty():
|
218
202
|
return f"## {sheet_name}\n\n*No data*"
|
219
203
|
|
220
|
-
# Create a mock TableData for enhanced formatting
|
221
|
-
from kreuzberg._types import TableData # noqa: PLC0415
|
222
|
-
|
223
|
-
# Create a 1x1 transparent image as placeholder
|
224
204
|
placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
|
225
205
|
mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
|
226
206
|
|
227
207
|
enhanced_markdown = enhance_table_markdown(mock_table)
|
228
208
|
return f"## {sheet_name}\n\n{enhanced_markdown}"
|
229
209
|
|
230
|
-
except (
|
231
|
-
# Fallback to original method if pandas/table enhancement fails
|
210
|
+
except (AttributeError, ValueError):
|
232
211
|
return self._convert_sheet_to_text_sync(workbook, sheet_name)
|
233
212
|
|
234
213
|
@staticmethod
|
235
214
|
def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
|
236
|
-
"""Extract metadata from spreadsheet using python-calamine.
|
237
|
-
|
238
|
-
Args:
|
239
|
-
workbook: CalamineWorkbook instance
|
240
|
-
|
241
|
-
Returns:
|
242
|
-
Metadata dict using existing metadata keys where possible
|
243
|
-
"""
|
244
215
|
metadata: Metadata = {}
|
245
216
|
|
246
|
-
# Extract basic document properties
|
247
217
|
SpreadSheetExtractor._extract_document_properties(workbook, metadata)
|
248
218
|
|
249
|
-
# Add structural information
|
250
219
|
SpreadSheetExtractor._add_structure_info(workbook, metadata)
|
251
220
|
|
252
|
-
# Analyze content complexity
|
253
221
|
SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
|
254
222
|
|
255
223
|
return metadata
|
256
224
|
|
257
225
|
@staticmethod
|
258
226
|
def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
|
259
|
-
"""Extract basic document properties from workbook."""
|
260
227
|
with contextlib.suppress(AttributeError, Exception):
|
261
228
|
if not (hasattr(workbook, "metadata") and workbook.metadata):
|
262
229
|
return
|
263
230
|
|
264
231
|
props = workbook.metadata
|
265
232
|
|
266
|
-
# Basic properties mapping
|
267
233
|
property_mapping = {
|
268
234
|
"title": "title",
|
269
|
-
"author": "authors",
|
235
|
+
"author": "authors",
|
270
236
|
"subject": "subject",
|
271
237
|
"comments": "comments",
|
272
|
-
"keywords": "keywords",
|
273
|
-
"category": "categories",
|
238
|
+
"keywords": "keywords",
|
239
|
+
"category": "categories",
|
274
240
|
"company": "organization",
|
275
241
|
"manager": "modified_by",
|
276
242
|
}
|
@@ -286,12 +252,10 @@ class SpreadSheetExtractor(Extractor):
|
|
286
252
|
else:
|
287
253
|
metadata[meta_key] = value # type: ignore[literal-required]
|
288
254
|
|
289
|
-
# Handle dates separately
|
290
255
|
SpreadSheetExtractor._extract_date_properties(props, metadata)
|
291
256
|
|
292
257
|
@staticmethod
|
293
258
|
def _extract_date_properties(props: Any, metadata: Metadata) -> None:
|
294
|
-
"""Extract and format date properties."""
|
295
259
|
date_mapping = {"created": "created_at", "modified": "modified_at"}
|
296
260
|
|
297
261
|
for prop_name, meta_key in date_mapping.items():
|
@@ -304,14 +268,12 @@ class SpreadSheetExtractor(Extractor):
|
|
304
268
|
|
305
269
|
@staticmethod
|
306
270
|
def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
|
307
|
-
"""Add structural information about the spreadsheet."""
|
308
271
|
if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
|
309
272
|
return
|
310
273
|
|
311
274
|
sheet_count = len(workbook.sheet_names)
|
312
275
|
structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
|
313
276
|
|
314
|
-
# Don't list too many sheet names (magic number made constant)
|
315
277
|
max_sheet_names_to_list = 5
|
316
278
|
if sheet_count <= max_sheet_names_to_list:
|
317
279
|
structure_info += f": {', '.join(workbook.sheet_names)}"
|
@@ -320,12 +282,10 @@ class SpreadSheetExtractor(Extractor):
|
|
320
282
|
|
321
283
|
@staticmethod
|
322
284
|
def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
|
323
|
-
"""Analyze spreadsheet content for complexity indicators."""
|
324
285
|
with contextlib.suppress(Exception):
|
325
286
|
has_formulas = False
|
326
287
|
total_cells = 0
|
327
288
|
|
328
|
-
# Check only first few sheets for performance
|
329
289
|
max_sheets_to_check = 3
|
330
290
|
max_rows_to_check = 50
|
331
291
|
|
@@ -335,17 +295,15 @@ class SpreadSheetExtractor(Extractor):
|
|
335
295
|
data = sheet.to_python()
|
336
296
|
|
337
297
|
for row in data[:max_rows_to_check]:
|
338
|
-
if not row:
|
298
|
+
if not row:
|
339
299
|
continue
|
340
300
|
|
341
301
|
total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
|
342
302
|
|
343
|
-
# Check for formulas (simple heuristic)
|
344
303
|
if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
|
345
304
|
has_formulas = True
|
346
305
|
break
|
347
306
|
|
348
|
-
# Build summary
|
349
307
|
summary_parts = []
|
350
308
|
if total_cells > 0:
|
351
309
|
summary_parts.append(f"Contains {total_cells}+ data cells")
|
@@ -28,7 +28,6 @@ from kreuzberg._utils._sync import run_sync
|
|
28
28
|
if TYPE_CHECKING:
|
29
29
|
from pathlib import Path
|
30
30
|
|
31
|
-
# Define text field keywords as a set for O(1) membership testing
|
32
31
|
_TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})
|
33
32
|
|
34
33
|
|
@@ -79,7 +78,6 @@ class StructuredDataExtractor(Extractor):
|
|
79
78
|
text_parts: list[str] = []
|
80
79
|
metadata: dict[str, Any] = {}
|
81
80
|
|
82
|
-
# Use match statement for cleaner code and avoid multiple isinstance calls
|
83
81
|
if isinstance(data, dict):
|
84
82
|
text_parts = self._extract_from_dict(data, metadata)
|
85
83
|
elif isinstance(data, list):
|
@@ -96,7 +94,7 @@ class StructuredDataExtractor(Extractor):
|
|
96
94
|
chunks=[],
|
97
95
|
)
|
98
96
|
|
99
|
-
except (
|
97
|
+
except (ValueError, TypeError) as e:
|
100
98
|
return ExtractionResult(
|
101
99
|
content=normalize_spaces(text_content),
|
102
100
|
mime_type=PLAIN_TEXT_MIME_TYPE,
|
@@ -117,7 +115,6 @@ class StructuredDataExtractor(Extractor):
|
|
117
115
|
if isinstance(value, str) and value.strip():
|
118
116
|
text_parts.append(f"{full_key}: {value}")
|
119
117
|
|
120
|
-
# Check if key contains any text field keywords efficiently
|
121
118
|
key_lower = key.lower()
|
122
119
|
if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
|
123
120
|
metadata[full_key] = value
|