kreuzberg 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (36)
  1. kreuzberg/_extractors/_base.py +40 -0
  2. kreuzberg/_extractors/_email.py +149 -0
  3. kreuzberg/_extractors/_html.py +15 -3
  4. kreuzberg/_extractors/_image.py +17 -18
  5. kreuzberg/_extractors/_pdf.py +68 -14
  6. kreuzberg/_extractors/_presentation.py +62 -10
  7. kreuzberg/_extractors/_spread_sheet.py +179 -4
  8. kreuzberg/_extractors/_structured.py +148 -0
  9. kreuzberg/_gmft.py +2 -2
  10. kreuzberg/_mcp/__init__.py +5 -0
  11. kreuzberg/_mcp/server.py +227 -0
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_multiprocessing/__init__.py +2 -3
  14. kreuzberg/_ocr/__init__.py +30 -0
  15. kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
  16. kreuzberg/_ocr/_sync.py +566 -0
  17. kreuzberg/_ocr/_tesseract.py +6 -2
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +17 -2
  21. kreuzberg/_utils/_process_pool.py +178 -1
  22. kreuzberg/_utils/_quality.py +237 -0
  23. kreuzberg/_utils/_serialization.py +4 -2
  24. kreuzberg/_utils/_string.py +153 -10
  25. kreuzberg/_utils/_sync.py +5 -2
  26. kreuzberg/_utils/_table.py +261 -0
  27. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +116 -48
  28. kreuzberg-3.8.0.dist-info/RECORD +57 -0
  29. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +1 -0
  30. kreuzberg/_multiprocessing/process_manager.py +0 -189
  31. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  32. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  33. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  34. kreuzberg-3.6.2.dist-info/RECORD +0 -54
  35. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -13,7 +13,7 @@ from python_calamine import CalamineWorkbook
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, Metadata
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._tmp import create_temp_file
@@ -45,9 +45,14 @@ class SpreadSheetExtractor(Extractor):
         try:
             results: list[str] = await run_taskgroup(*tasks)
 
-            return ExtractionResult(
-                content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[]
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
             )
+
+            return self._apply_quality_processing(result)
         except ExceptionGroup as eg:
             raise ParsingError(
                 "Failed to extract file data",
@@ -87,7 +92,14 @@ class SpreadSheetExtractor(Extractor):
                 sheet_text = self._convert_sheet_to_text_sync(workbook, sheet_name)
                 results.append(sheet_text)
 
-            return ExtractionResult(content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
+            )
+
+            return self._apply_quality_processing(result)
         except Exception as e:
             raise ParsingError(
                 "Failed to extract file data",
@@ -181,3 +193,166 @@ class SpreadSheetExtractor(Extractor):
         result = "\n".join(markdown_lines)
 
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
+
+    def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
+        """Enhanced sheet processing with better table structure preservation."""
+        try:
+            # pandas is optional dependency
+            import pandas as pd
+
+            from kreuzberg._utils._table import enhance_table_markdown
+
+            sheet = workbook.get_sheet_by_name(sheet_name)
+            data = sheet.to_python()
+
+            if not data or not any(row for row in data):
+                return f"## {sheet_name}\n\n*Empty sheet*"
+
+            # Convert to DataFrame
+            df = pd.DataFrame(data)
+
+            # Clean up empty rows and columns
+            df = df.dropna(how="all").dropna(axis=1, how="all")
+
+            if df.empty:
+                return f"## {sheet_name}\n\n*No data*"
+
+            # Create a mock TableData for enhanced formatting
+            from PIL import Image
+
+            from kreuzberg._types import TableData
+
+            # Create a 1x1 transparent image as placeholder
+            placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
+            mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
+
+            enhanced_markdown = enhance_table_markdown(mock_table)
+            return f"## {sheet_name}\n\n{enhanced_markdown}"
+
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to original method if pandas/table enhancement fails
+            return self._convert_sheet_to_text_sync(workbook, sheet_name)
+
+    @staticmethod
+    def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
+        """Extract metadata from spreadsheet using python-calamine.
+
+        Args:
+            workbook: CalamineWorkbook instance
+
+        Returns:
+            Metadata dict using existing metadata keys where possible
+        """
+        metadata: Metadata = {}
+
+        # Extract basic document properties
+        SpreadSheetExtractor._extract_document_properties(workbook, metadata)
+
+        # Add structural information
+        SpreadSheetExtractor._add_structure_info(workbook, metadata)
+
+        # Analyze content complexity
+        SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
+
+        return metadata
+
+    @staticmethod
+    def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Extract basic document properties from workbook."""
+        with contextlib.suppress(AttributeError, Exception):
+            if not (hasattr(workbook, "metadata") and workbook.metadata):
+                return
+
+            props = workbook.metadata
+
+            # Basic properties mapping
+            property_mapping = {
+                "title": "title",
+                "author": "authors",  # Convert to list
+                "subject": "subject",
+                "comments": "comments",
+                "keywords": "keywords",  # Process separately
+                "category": "categories",  # Convert to list
+                "company": "organization",
+                "manager": "modified_by",
+            }
+
+            for prop_name, meta_key in property_mapping.items():
+                if hasattr(props, prop_name) and (value := getattr(props, prop_name)):
+                    if meta_key in ("authors", "categories"):
+                        metadata[meta_key] = [value]  # type: ignore[literal-required]
+                    elif meta_key == "keywords":
+                        keywords = [k.strip() for k in value.replace(";", ",").split(",") if k.strip()]
+                        if keywords:
+                            metadata[meta_key] = keywords  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = value  # type: ignore[literal-required]
+
+            # Handle dates separately
+            SpreadSheetExtractor._extract_date_properties(props, metadata)
+
+    @staticmethod
+    def _extract_date_properties(props: Any, metadata: Metadata) -> None:
+        """Extract and format date properties."""
+        date_mapping = {"created": "created_at", "modified": "modified_at"}
+
+        for prop_name, meta_key in date_mapping.items():
+            if hasattr(props, prop_name) and (date_value := getattr(props, prop_name)):
+                with contextlib.suppress(Exception):
+                    if hasattr(date_value, "isoformat"):
+                        metadata[meta_key] = date_value.isoformat()  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = str(date_value)  # type: ignore[literal-required]
+
+    @staticmethod
+    def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Add structural information about the spreadsheet."""
+        if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
+            return
+
+        sheet_count = len(workbook.sheet_names)
+        structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
+
+        # Don't list too many sheet names (magic number made constant)
+        max_sheet_names_to_list = 5
+        if sheet_count <= max_sheet_names_to_list:
+            structure_info += f": {', '.join(workbook.sheet_names)}"
+
+        metadata["description"] = structure_info
+
+    @staticmethod
+    def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Analyze spreadsheet content for complexity indicators."""
+        with contextlib.suppress(Exception):
+            has_formulas = False
+            total_cells = 0
+
+            # Check only first few sheets for performance
+            max_sheets_to_check = 3
+            max_rows_to_check = 50
+
+            for sheet_name in workbook.sheet_names[:max_sheets_to_check]:
+                with contextlib.suppress(Exception):
+                    sheet = workbook.get_sheet_by_name(sheet_name)
+                    data = sheet.to_python()
+
+                    for row in data[:max_rows_to_check]:
+                        if not row:  # Skip empty rows
+                            continue
+
+                        total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
+
+                        # Check for formulas (simple heuristic)
+                        if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
+                            has_formulas = True
+                            break
+
+            # Build summary
+            summary_parts = []
+            if total_cells > 0:
+                summary_parts.append(f"Contains {total_cells}+ data cells")
+            if has_formulas:
+                summary_parts.append("includes formulas")
+
+            if summary_parts and "summary" not in metadata:
+                metadata["summary"] = f"Spreadsheet that {', '.join(summary_parts)}."
kreuzberg/_extractors/_structured.py ADDED
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Any, ClassVar
+
+from anyio import Path as AsyncPath
+
+from kreuzberg._extractors._base import Extractor
+from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._string import normalize_spaces, safe_decode
+from kreuzberg._utils._sync import run_sync
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class StructuredDataExtractor(Extractor):
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
+        JSON_MIME_TYPE,
+        "text/json",
+        YAML_MIME_TYPE,
+        "text/yaml",
+        "text/x-yaml",
+        "application/yaml",
+        TOML_MIME_TYPE,
+        "text/toml",
+    }
+
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+        return await run_sync(self.extract_bytes_sync, content)
+
+    async def extract_path_async(self, path: Path) -> ExtractionResult:
+        content = await AsyncPath(path).read_bytes()
+        return await self.extract_bytes_async(content)
+
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+        text_content = safe_decode(content)
+
+        try:
+            if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
+                data = json.loads(text_content)
+            elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                try:
+                    import tomllib  # type: ignore[import-not-found]
+                except ImportError:
+                    try:
+                        import tomli as tomllib  # type: ignore[import-not-found]
+                    except ImportError:
+                        return ExtractionResult(
+                            content=normalize_spaces(text_content),
+                            mime_type=PLAIN_TEXT_MIME_TYPE,
+                            metadata={"warning": "tomllib/tomli not available, returning raw text"},
+                            chunks=[],
+                        )
+                data = tomllib.loads(text_content)
+            else:
+                try:
+                    import yaml
+
+                    data = yaml.safe_load(text_content)
+                except ImportError:
+                    return ExtractionResult(
+                        content=normalize_spaces(text_content),
+                        mime_type=PLAIN_TEXT_MIME_TYPE,
+                        metadata={"warning": "PyYAML not available, returning raw text"},
+                        chunks=[],
+                    )
+
+            text_parts: list[str] = []
+            metadata: dict[str, Any] = {}
+
+            if isinstance(data, dict):
+                text_parts.extend(self._extract_from_dict(data, metadata))
+            elif isinstance(data, list):
+                text_parts.extend(self._extract_from_list(data, metadata))
+            else:
+                text_parts.append(str(data))
+
+            combined_text = "\n".join(text_parts) if text_parts else text_content
+
+            return ExtractionResult(
+                content=normalize_spaces(combined_text),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata=normalize_metadata(metadata),
+                chunks=[],
+            )
+
+        except (ValueError, TypeError, KeyError, AttributeError, UnicodeDecodeError) as e:
+            return ExtractionResult(
+                content=normalize_spaces(text_content),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={"parse_error": str(e)},
+                chunks=[],
+            )
+
+    def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content = path.read_bytes()
+        return self.extract_bytes_sync(content)
+
+    def _extract_from_dict(self, data: dict[str, Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for key, value in data.items():
+            full_key = f"{prefix}.{key}" if prefix else key
+
+            if isinstance(value, str) and value.strip():
+                text_parts.append(f"{full_key}: {value}")
+
+                if any(
+                    text_field in key.lower()
+                    for text_field in ["title", "name", "subject", "description", "content", "body", "text", "message"]
+                ):
+                    metadata[full_key] = value
+
+            elif isinstance(value, (int, float, bool)):
+                text_parts.append(f"{full_key}: {value}")
+
+            elif isinstance(value, dict):
+                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+
+            elif isinstance(value, list):
+                text_parts.extend(self._extract_from_list(value, metadata, full_key))
+
+            elif value is not None:
+                text_parts.append(f"{full_key}: {value!s}")
+
+        return text_parts
+
+    def _extract_from_list(self, data: list[Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for i, item in enumerate(data):
+            item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
+
+            if isinstance(item, str) and item.strip():
+                text_parts.append(f"{item_key}: {item}")
+
+            elif isinstance(item, dict):
+                text_parts.extend(self._extract_from_dict(item, metadata, item_key))
+
+            elif isinstance(item, list):
+                text_parts.extend(self._extract_from_list(item, metadata, item_key))
+
+            elif item is not None:
+                text_parts.append(f"{item_key}: {item!s}")
+
+        return text_parts
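
Note: a small usage sketch for the new `StructuredDataExtractor` (not part of the diff). It assumes the base `Extractor` is constructed with `(mime_type, config)`, as the extractor registry does elsewhere in the package; the flattened keys shown reflect `_extract_from_dict`/`_extract_from_list` as added above:

    from kreuzberg._extractors._structured import StructuredDataExtractor
    from kreuzberg._mime_types import JSON_MIME_TYPE
    from kreuzberg._types import ExtractionConfig

    extractor = StructuredDataExtractor(mime_type=JSON_MIME_TYPE, config=ExtractionConfig())
    result = extractor.extract_bytes_sync(b'{"title": "Q3 Report", "stats": {"rows": 42}, "tags": ["a", "b"]}')

    print(result.content)
    # title: Q3 Report
    # stats.rows: 42
    # tags[0]: a
    # tags[1]: b
    # ("title" also lands in metadata via the text-field heuristic, subject to normalize_metadata filtering.)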
kreuzberg/_gmft.py CHANGED
@@ -196,7 +196,7 @@ async def extract_tables(  # noqa: PLR0915
 
     try:
         if use_isolated_process:
-            from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated_async
+            from kreuzberg._multiprocessing import extract_tables_isolated_async
 
             result = await extract_tables_isolated_async(file_path, config)
 
@@ -314,7 +314,7 @@ def extract_tables_sync(
         return cached_result  # type: ignore[no-any-return]
 
     if use_isolated_process:
-        from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated
+        from kreuzberg._multiprocessing import extract_tables_isolated
 
         result = extract_tables_isolated(file_path, config)
 
kreuzberg/_mcp/__init__.py ADDED
@@ -0,0 +1,5 @@
+"""MCP server for Kreuzberg text extraction."""
+
+from .server import mcp
+
+__all__ = ["mcp"]
kreuzberg/_mcp/server.py ADDED
@@ -0,0 +1,227 @@
+"""Kreuzberg MCP server implementation."""
+
+from __future__ import annotations
+
+import base64
+from typing import Any
+
+from mcp.server import FastMCP
+from mcp.types import TextContent
+
+from kreuzberg._types import ExtractionConfig, OcrBackendType
+from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
+
+# Create the MCP server
+mcp = FastMCP("Kreuzberg Text Extraction")
+
+
+@mcp.tool()
+def extract_document(  # noqa: PLR0913
+    file_path: str,
+    mime_type: str | None = None,
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+) -> dict[str, Any]:
+    """Extract text content from a document file.
+
+    Args:
+        file_path: Path to the document file
+        mime_type: MIME type of the document (auto-detected if not provided)
+        force_ocr: Force OCR even for text-based documents
+        chunk_content: Split content into chunks
+        extract_tables: Extract tables from the document
+        extract_entities: Extract named entities
+        extract_keywords: Extract keywords
+        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
+        max_chars: Maximum characters per chunk
+        max_overlap: Character overlap between chunks
+        keyword_count: Number of keywords to extract
+        auto_detect_language: Auto-detect document language
+
+    Returns:
+        Extracted content with metadata, tables, chunks, entities, and keywords
+    """
+    config = ExtractionConfig(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+    )
+
+    result = extract_file_sync(file_path, mime_type, config)
+    return result.to_dict()
+
+
+@mcp.tool()
+def extract_bytes(  # noqa: PLR0913
+    content_base64: str,
+    mime_type: str,
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+) -> dict[str, Any]:
+    """Extract text content from document bytes.
+
+    Args:
+        content_base64: Base64-encoded document content
+        mime_type: MIME type of the document
+        force_ocr: Force OCR even for text-based documents
+        chunk_content: Split content into chunks
+        extract_tables: Extract tables from the document
+        extract_entities: Extract named entities
+        extract_keywords: Extract keywords
+        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
+        max_chars: Maximum characters per chunk
+        max_overlap: Character overlap between chunks
+        keyword_count: Number of keywords to extract
+        auto_detect_language: Auto-detect document language
+
+    Returns:
+        Extracted content with metadata, tables, chunks, entities, and keywords
+    """
+    content_bytes = base64.b64decode(content_base64)
+
+    config = ExtractionConfig(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+    )
+
+    result = extract_bytes_sync(content_bytes, mime_type, config)
+    return result.to_dict()
+
+
+@mcp.tool()
+def extract_simple(
+    file_path: str,
+    mime_type: str | None = None,
+) -> str:
+    """Simple text extraction from a document file.
+
+    Args:
+        file_path: Path to the document file
+        mime_type: MIME type of the document (auto-detected if not provided)
+
+    Returns:
+        Extracted text content as a string
+    """
+    config = ExtractionConfig()
+    result = extract_file_sync(file_path, mime_type, config)
+    return result.content
+
+
+@mcp.resource("config://default")
+def get_default_config() -> str:
+    """Get the default extraction configuration."""
+    config = ExtractionConfig()
+    return str(config.__dict__)
+
+
+@mcp.resource("config://available-backends")
+def get_available_backends() -> str:
+    """Get available OCR backends."""
+    return "tesseract, easyocr, paddleocr"
+
+
+@mcp.resource("extractors://supported-formats")
+def get_supported_formats() -> str:
+    """Get supported document formats."""
+    return """
+    Supported formats:
+    - PDF documents
+    - Images (PNG, JPG, JPEG, TIFF, BMP, WEBP)
+    - Office documents (DOCX, PPTX, XLSX)
+    - HTML files
+    - Text files (TXT, CSV, TSV)
+    - And more...
+    """
+
+
+@mcp.prompt()
+def extract_and_summarize(file_path: str) -> list[TextContent]:
+    """Extract text from a document and provide a summary prompt.
+
+    Args:
+        file_path: Path to the document file
+
+    Returns:
+        Extracted content with summarization prompt
+    """
+    result = extract_file_sync(file_path, None, ExtractionConfig())
+
+    return [
+        TextContent(
+            type="text",
+            text=f"Document Content:\n{result.content}\n\nPlease provide a concise summary of this document.",
+        )
+    ]
+
+
+@mcp.prompt()
+def extract_structured(file_path: str) -> list[TextContent]:
+    """Extract text with structured analysis prompt.
+
+    Args:
+        file_path: Path to the document file
+
+    Returns:
+        Extracted content with structured analysis prompt
+    """
+    config = ExtractionConfig(
+        extract_entities=True,
+        extract_keywords=True,
+        extract_tables=True,
+    )
+    result = extract_file_sync(file_path, None, config)
+
+    content = f"Document Content:\n{result.content}\n\n"
+
+    if result.entities:
+        content += f"Entities: {[f'{e.text} ({e.type})' for e in result.entities]}\n\n"
+
+    if result.keywords:
+        content += f"Keywords: {[f'{kw[0]} ({kw[1]:.2f})' for kw in result.keywords]}\n\n"
+
+    if result.tables:
+        content += f"Tables found: {len(result.tables)}\n\n"
+
+    content += "Please analyze this document and provide structured insights."
+
+    return [TextContent(type="text", text=content)]
+
+
+def main() -> None:
+    """Main entry point for the MCP server."""
+    mcp.run()
+
+
+if __name__ == "__main__":
+    main()
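
Note: MCP tool arguments must be JSON-serializable, which is why `extract_bytes` takes `content_base64` rather than raw bytes. A client-side sketch for preparing a call (file name and session variable are placeholders; `call_tool` follows the MCP Python SDK's `ClientSession` and is stated here as an assumption):

    import base64
    from pathlib import Path

    # Encode the document for transport; "scan.pdf" is a placeholder.
    payload = base64.b64encode(Path("scan.pdf").read_bytes()).decode("ascii")

    arguments = {
        "content_base64": payload,
        "mime_type": "application/pdf",
        "force_ocr": True,
        "ocr_backend": "tesseract",
    }
    # With a connected MCP client session:
    #     result = await session.call_tool("extract_bytes", arguments)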
kreuzberg/_mime_types.py CHANGED
@@ -17,6 +17,12 @@ PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
 POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
 
+EML_MIME_TYPE: Final = "message/rfc822"
+MSG_MIME_TYPE: Final = "application/vnd.ms-outlook"
+JSON_MIME_TYPE: Final = "application/json"
+YAML_MIME_TYPE: Final = "application/x-yaml"
+TOML_MIME_TYPE: Final = "application/toml"
+
 EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
 EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
@@ -127,6 +133,12 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
     ".org": "text/x-org",
     ".epub": "application/epub+zip",
     ".rtf": "application/rtf",
+    ".eml": EML_MIME_TYPE,
+    ".msg": MSG_MIME_TYPE,
+    ".json": JSON_MIME_TYPE,
+    ".yaml": YAML_MIME_TYPE,
+    ".yml": YAML_MIME_TYPE,
+    ".toml": TOML_MIME_TYPE,
     ".odt": "application/vnd.oasis.opendocument.text",
     ".docx": DOCX_MIME_TYPE,
     ".bib": "application/x-bibtex",
@@ -139,7 +151,21 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
     | IMAGE_MIME_TYPES
     | PANDOC_SUPPORTED_MIME_TYPES
     | SPREADSHEET_MIME_TYPES
-    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
+    | {
+        PDF_MIME_TYPE,
+        POWER_POINT_MIME_TYPE,
+        HTML_MIME_TYPE,
+        EML_MIME_TYPE,
+        MSG_MIME_TYPE,
+        JSON_MIME_TYPE,
+        YAML_MIME_TYPE,
+        TOML_MIME_TYPE,
+        "text/json",
+        "text/yaml",
+        "text/x-yaml",
+        "application/yaml",
+        "text/toml",
+    }
 )
 
 
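Note: with the new constants wired into both `EXT_TO_MIME_TYPE` and `SUPPORTED_MIME_TYPES`, a plain suffix lookup now resolves structured-data and email files. A quick illustrative sketch (file names are placeholders):

    from pathlib import Path

    from kreuzberg._mime_types import EXT_TO_MIME_TYPE, SUPPORTED_MIME_TYPES

    for name in ("settings.toml", "pipeline.yaml", "payload.json", "thread.eml"):
        mime = EXT_TO_MIME_TYPE[Path(name).suffix]
        assert mime in SUPPORTED_MIME_TYPES
        print(name, "->", mime)  # e.g. settings.toml -> application/toml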
kreuzberg/_multiprocessing/__init__.py CHANGED
@@ -1,6 +1,5 @@
 """Multiprocessing utilities for kreuzberg."""
 
-from .process_manager import ProcessPoolManager
-from .tesseract_pool import TesseractProcessPool
+from .gmft_isolated import extract_tables_isolated, extract_tables_isolated_async
 
-__all__ = ["ProcessPoolManager", "TesseractProcessPool"]
+__all__ = ["extract_tables_isolated", "extract_tables_isolated_async"]
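
Note: for downstream code the practical consequence is an import-path change: `kreuzberg._multiprocessing` now re-exports only the isolated-GMFT helpers, and the pool classes it used to export are gone from this package (the Tesseract pool moved to `kreuzberg/_ocr/_pool.py`, per the file list above). A sketch of the migration:

    # 3.6.2-era imports that no longer resolve:
    #     from kreuzberg._multiprocessing import ProcessPoolManager, TesseractProcessPool
    # 3.8.0: import the isolated GMFT helpers from the package root instead.
    from kreuzberg._multiprocessing import extract_tables_isolated, extract_tables_isolated_async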