classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
src/utils/file_parser.py
ADDED
|
@@ -0,0 +1,777 @@
|
|
|
1
|
+
"""MIME detection and text extraction utilities for local file parsing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import tempfile
|
|
7
|
+
import threading
|
|
8
|
+
from collections.abc import Generator
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from urllib.parse import urlsplit
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class ParsedFile:
    """Outcome of parsing a file that was read from local disk."""

    # Resolved MIME type of the file.
    mime_type: str
    # Text extracted from the file; empty when nothing was extracted.
    text_content: str
    # True when the content is treated as binary rather than text.
    is_binary: bool
    # Size of the file in bytes.
    file_size_bytes: int = 0
    # Detected character encoding, when one was determined.
    encoding: str | None = None
    # Error message from extraction, or None when none was reported.
    parse_error: str | None = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class ParsedBytes:
    """Outcome of parsing a payload that is already held in memory."""

    # Resolved MIME type of the payload.
    mime_type: str
    # Decoded payload as-is (empty for non text-like MIME types).
    raw_content: str
    # Text extracted from the payload; empty when nothing was extracted.
    text_content: str
    # True when the payload is treated as binary rather than text.
    is_binary: bool
    # Length of the payload in bytes.
    file_size_bytes: int
    # Error message from extraction, or None when none was reported.
    parse_error: str | None = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
_TEXT_RAW_MIME_TYPES = {
|
|
41
|
+
"application/json",
|
|
42
|
+
"application/xml",
|
|
43
|
+
"text/xml",
|
|
44
|
+
"application/xhtml+xml",
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
_TABULAR_MIME_TYPES = {
|
|
48
|
+
"text/csv",
|
|
49
|
+
"text/tab-separated-values",
|
|
50
|
+
"application/vnd.ms-excel",
|
|
51
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
52
|
+
"application/parquet",
|
|
53
|
+
"application/vnd.apache.parquet",
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
_MIME_HINTS_BY_EXTENSION = {
|
|
57
|
+
".csv": "text/csv",
|
|
58
|
+
".tsv": "text/tab-separated-values",
|
|
59
|
+
".parquet": "application/parquet",
|
|
60
|
+
".json": "application/json",
|
|
61
|
+
".xml": "application/xml",
|
|
62
|
+
".html": "text/html",
|
|
63
|
+
".htm": "text/html",
|
|
64
|
+
".md": "text/markdown",
|
|
65
|
+
".txt": "text/plain",
|
|
66
|
+
".pdf": "application/pdf",
|
|
67
|
+
".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
68
|
+
".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
69
|
+
".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
70
|
+
".png": "image/png",
|
|
71
|
+
".jpg": "image/jpeg",
|
|
72
|
+
".jpeg": "image/jpeg",
|
|
73
|
+
".gif": "image/gif",
|
|
74
|
+
".bmp": "image/bmp",
|
|
75
|
+
".tif": "image/tiff",
|
|
76
|
+
".tiff": "image/tiff",
|
|
77
|
+
".webp": "image/webp",
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
_DOCLING_IMAGE_MIME_TYPES = {
|
|
81
|
+
"image/png",
|
|
82
|
+
"image/jpeg",
|
|
83
|
+
"image/tiff",
|
|
84
|
+
"image/bmp",
|
|
85
|
+
"image/webp",
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
_DOCLING_MIME_TYPES = {
|
|
89
|
+
"application/pdf",
|
|
90
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
91
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
92
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
93
|
+
"text/html",
|
|
94
|
+
"application/xhtml+xml",
|
|
95
|
+
}
|
|
96
|
+
_DOCLING_EXTENSIONS = {
|
|
97
|
+
".pdf",
|
|
98
|
+
".docx",
|
|
99
|
+
".pptx",
|
|
100
|
+
".xlsx",
|
|
101
|
+
".html",
|
|
102
|
+
".htm",
|
|
103
|
+
".png",
|
|
104
|
+
".jpg",
|
|
105
|
+
".jpeg",
|
|
106
|
+
".bmp",
|
|
107
|
+
".tif",
|
|
108
|
+
".tiff",
|
|
109
|
+
".webp",
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class _DoclingState:
|
|
114
|
+
"""Mutable singleton state for the Docling DocumentConverter.
|
|
115
|
+
|
|
116
|
+
Stored as object attributes so functions can mutate state without `global`
|
|
117
|
+
statements (which ruff PLW0603 disallows). Initializing the converter is
|
|
118
|
+
expensive (loads ML models), so it happens exactly once per process.
|
|
119
|
+
"""
|
|
120
|
+
|
|
121
|
+
def __init__(self) -> None:
|
|
122
|
+
self.converter: object = None
|
|
123
|
+
self.error: str | None = None
|
|
124
|
+
self.attempted: bool = False
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
_docling_state = _DoclingState()
|
|
128
|
+
_docling_lock = threading.Lock()
|
|
129
|
+
# Limits concurrent converter.convert() calls to prevent OOM. The docling
|
|
130
|
+
# StandardPdfPipeline alone holds ~1 GB of model weights; additional working
|
|
131
|
+
# memory per in-flight conversion can push the process over the 4 GiB K8s
|
|
132
|
+
# limit when two conversions run simultaneously.
|
|
133
|
+
_docling_conversion_sem = threading.Semaphore(1)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _get_docling_converter() -> tuple[object, str | None]:
    """Return the cached ``(converter, error)`` pair, initializing on first use.

    Returns:
        ``(converter, None)`` after a successful initialization, or
        ``(None, message)`` when importing or constructing the converter raised.
    """
    state = _docling_state

    # Lock-free shortcut, taken only when the outcome is final (a converter
    # exists or an error was recorded). `attempted` is not a usable signal here:
    # it flips to True before the import/construction finishes, so a thread
    # arriving mid-initialization would otherwise get (None, None) back and
    # report the converter as unavailable instead of waiting for the result.
    settled = state.converter is not None or state.error is not None
    if settled:
        return state.converter, state.error

    with _docling_lock:
        if not state.attempted:
            state.attempted = True
            try:
                from ..sources.dependencies import require_module

                module = require_module(
                    "docling.document_converter",
                    "file parser OCR",
                    ["ocr"],
                    detail="OCR extraction requires the Docling optional dependency.",
                )
                state.converter = module.DocumentConverter()
            except Exception as exc:
                # Record the failure so later calls return it via the shortcut.
                state.error = str(exc)
        return state.converter, state.error
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _reset_docling_singleton() -> None:
    """Forget the cached Docling converter and init status (test isolation only)."""
    pristine = (("converter", None), ("error", None), ("attempted", False))
    with _docling_lock:
        for attribute, value in pristine:
            setattr(_docling_state, attribute, value)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _normalize_mime_type(mime_type: str | None) -> str:
|
|
173
|
+
if not mime_type:
|
|
174
|
+
return ""
|
|
175
|
+
return str(mime_type).split(";", 1)[0].strip().lower()
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _file_extension(file_name: str) -> str:
|
|
179
|
+
if not file_name:
|
|
180
|
+
return ""
|
|
181
|
+
path = urlsplit(file_name).path
|
|
182
|
+
value = path if path else file_name
|
|
183
|
+
return Path(value).suffix.lower()
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def infer_mime_type_from_file_name(file_name: str) -> str:
    """Map the extension of a file name or URL path to a MIME type.

    Returns "application/octet-stream" when the extension is missing or is
    not one of the known hints.
    """
    suffix = _file_extension(file_name)
    if suffix in _MIME_HINTS_BY_EXTENSION:
        return _MIME_HINTS_BY_EXTENSION[suffix]
    return "application/octet-stream"
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def normalize_detected_mime_type(detected_mime_type: str, file_name: str) -> str:
    """
    Normalize a detected MIME type, deferring to the file name where needed.

    The extension-based MIME type replaces the detected one in two cases:
    the detected type is empty or "application/octet-stream", or it is
    "text/plain" while the extension indicates a tabular format. This keeps
    parsing stable for sources that label tabular files with generic or
    plain-text content types. In every other case the normalized detected
    type is returned.
    """
    detected = _normalize_mime_type(detected_mime_type)
    by_name = infer_mime_type_from_file_name(file_name)

    is_generic = detected in ("", "application/octet-stream")
    is_mislabelled_table = detected == "text/plain" and by_name in _TABULAR_MIME_TYPES
    return by_name if is_generic or is_mislabelled_table else detected
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _is_text_like_mime_type(mime_type: str) -> bool:
    """Return True for ``text/*`` types and the raw-text types (JSON, XML, XHTML)."""
    essence = _normalize_mime_type(mime_type)
    if essence.startswith("text/"):
        return True
    return essence in _TEXT_RAW_MIME_TYPES
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _detect_magic_mime_type(file_bytes: bytes) -> str | None:
|
|
217
|
+
signatures: tuple[tuple[bytes, str], ...] = (
|
|
218
|
+
(b"\x89PNG\r\n\x1a\n", "image/png"),
|
|
219
|
+
(b"%PDF-", "application/pdf"),
|
|
220
|
+
(b"\xff\xd8\xff", "image/jpeg"),
|
|
221
|
+
(b"GIF87a", "image/gif"),
|
|
222
|
+
(b"GIF89a", "image/gif"),
|
|
223
|
+
(b"PK\x03\x04", "application/zip"),
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
for signature, mime_type in signatures:
|
|
227
|
+
if file_bytes.startswith(signature):
|
|
228
|
+
return mime_type
|
|
229
|
+
|
|
230
|
+
return None
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _sniff_text_mime(file_bytes: bytes) -> str:
|
|
234
|
+
"""Fallback MIME detection for text formats not handled by filetype."""
|
|
235
|
+
# Check for null bytes → binary
|
|
236
|
+
if b"\x00" in file_bytes[:8192]:
|
|
237
|
+
return "application/octet-stream"
|
|
238
|
+
|
|
239
|
+
# Try to decode a sample for text-based sniffing
|
|
240
|
+
sample = ""
|
|
241
|
+
try:
|
|
242
|
+
import chardet
|
|
243
|
+
|
|
244
|
+
detected = chardet.detect(file_bytes[:4096])
|
|
245
|
+
encoding = detected.get("encoding") or "utf-8"
|
|
246
|
+
sample = file_bytes[:4096].decode(encoding, errors="replace")
|
|
247
|
+
except Exception:
|
|
248
|
+
try:
|
|
249
|
+
sample = file_bytes[:4096].decode("utf-8", errors="replace")
|
|
250
|
+
except Exception:
|
|
251
|
+
return "application/octet-stream"
|
|
252
|
+
|
|
253
|
+
stripped = sample.lstrip()
|
|
254
|
+
|
|
255
|
+
if stripped.startswith("{") or stripped.startswith("["):
|
|
256
|
+
return "application/json"
|
|
257
|
+
if stripped.startswith("<?xml"):
|
|
258
|
+
return "application/xml"
|
|
259
|
+
if stripped.lower().startswith("<!doctype html") or stripped.lower().startswith("<html"):
|
|
260
|
+
return "text/html"
|
|
261
|
+
|
|
262
|
+
# CSV heuristic: first non-empty line has multiple commas
|
|
263
|
+
first_line = stripped.split("\n")[0] if "\n" in stripped else stripped
|
|
264
|
+
if first_line.count(",") >= 2:
|
|
265
|
+
return "text/csv"
|
|
266
|
+
|
|
267
|
+
return "text/plain"
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def detect_mime_type(file_bytes: bytes) -> str:
    """
    Detect MIME type from file bytes.

    Detection runs in three stages and stops at the first hit:

    1. the built-in signature table (``_detect_magic_mime_type``);
    2. the ``filetype`` library, if it can be imported;
    3. text-based sniffing (``_sniff_text_mime``) for formats the first two
       stages do not recognise.

    Args:
        file_bytes: Raw content to inspect.

    Returns:
        The detected MIME type; "application/octet-stream" for empty input.
    """
    if not file_bytes:
        return "application/octet-stream"

    magic_mime_type = _detect_magic_mime_type(file_bytes)
    if magic_mime_type:
        return magic_mime_type

    try:
        import filetype

        kind = filetype.guess(file_bytes)
        if kind is not None:
            return str(kind.mime)
    except Exception as e:
        # Best effort: a missing or failing `filetype` falls through to text
        # sniffing. Lazy %-formatting skips building the message when DEBUG
        # logging is disabled.
        logger.debug("filetype detection failed: %s", e)

    return _sniff_text_mime(file_bytes)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _supports_docling_ocr(mime_type: str, file_name: str) -> bool:
    """Return True when the MIME type or the file extension is routed to Docling."""
    essence = _normalize_mime_type(mime_type)
    if essence in _DOCLING_IMAGE_MIME_TYPES or essence in _DOCLING_MIME_TYPES:
        return True
    # The MIME type did not match; fall back to the extension.
    return _file_extension(file_name) in _DOCLING_EXTENSIONS
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
# PDFs with fewer extracted chars than this are likely scanned/image-only and
|
|
306
|
+
# need the full docling OCR pipeline. Most text-layer PDFs yield hundreds of
|
|
307
|
+
# chars; a threshold of 50 is conservative enough to never skip real content.
|
|
308
|
+
_MIN_NATIVE_PDF_CHARS = 50
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _extract_pdf_text(file_bytes: bytes) -> tuple[str, str | None]:
|
|
312
|
+
"""Extract text from a PDF using pdfplumber (no ML models required)."""
|
|
313
|
+
try:
|
|
314
|
+
import io
|
|
315
|
+
|
|
316
|
+
import pdfplumber
|
|
317
|
+
|
|
318
|
+
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
|
|
319
|
+
pages = []
|
|
320
|
+
for page in pdf.pages:
|
|
321
|
+
text = page.extract_text() or ""
|
|
322
|
+
if text:
|
|
323
|
+
pages.append(text)
|
|
324
|
+
return "\n\n".join(pages), None
|
|
325
|
+
except Exception as e:
|
|
326
|
+
return "", f"PDF extraction failed: {e}"
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _temp_file_name(file_name: str, mime_type: str) -> str:
    """Choose an ``input.<ext>`` name for staging bytes in a temporary file.

    The extension comes from, in order: the original file name, the first
    known extension hint whose MIME type equals ``mime_type``, or the subtype
    of an ``image/*`` MIME type (with "jpeg" spelled "jpg"). Falls back to
    "input.bin".
    """
    own_extension = _file_extension(file_name)
    if own_extension:
        return f"input{own_extension}"

    hinted = next(
        (suffix for suffix, known in _MIME_HINTS_BY_EXTENSION.items() if known == mime_type),
        None,
    )
    if hinted is not None:
        return f"input{hinted}"

    if mime_type.startswith("image/"):
        subtype = mime_type.split("/", maxsplit=1)[1].replace("jpeg", "jpg")
        return f"input.{subtype}"

    return "input.bin"
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _extract_docling_markdown(
    file_bytes: bytes,
    *,
    mime_type: str,
    file_name: str,
) -> tuple[str, str | None]:
    """Convert a document to Markdown text with the shared Docling converter.

    The bytes are written to a file inside a throwaway temporary directory and
    that path is handed to the converter.

    Returns:
        ``(markdown, None)`` on success, or ``("", message)`` when the converter
        is unavailable or the conversion raised.
    """
    converter, error = _get_docling_converter()
    if error:
        return "", error
    if converter is None:
        return "", "Docling converter unavailable"

    staged_name = _temp_file_name(file_name, mime_type)
    try:
        with tempfile.TemporaryDirectory(prefix="classifyre-docling-") as workdir:
            staged_path = Path(workdir) / staged_name
            staged_path.write_bytes(file_bytes)
            # One conversion at a time; see the semaphore's definition.
            with _docling_conversion_sem:
                result = converter.convert(staged_path)  # type: ignore[union-attr]
            text = result.document.export_to_markdown().strip()
            page_count = len(result.document.pages) if hasattr(result.document, "pages") else None
            page_note = f", {page_count} pages" if page_count else ""
            logger.info(
                "OCR extracted %d chars from %s (%s%s)",
                len(text),
                file_name or mime_type,
                mime_type,
                page_note,
            )
            return text, None
    except Exception as exc:
        return "", f"Docling extraction failed: {exc}"
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _extract_docx_text(file_bytes: bytes) -> tuple[str, str | None]:
    """Collect non-blank paragraph text, then non-blank table-cell text, from a DOCX."""
    try:
        import io

        import docx

        doc = docx.Document(io.BytesIO(file_bytes))
        parts: list[str] = [para.text for para in doc.paragraphs if para.text.strip()]
        parts.extend(
            cell.text
            for table in doc.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        )
        return "\n".join(parts), None
    except Exception as e:
        return "", f"DOCX extraction failed: {e}"


def _extract_xlsx_text(file_bytes: bytes) -> tuple[str, str | None]:
    """Render every non-blank worksheet row of an XLSX as a tab-separated line."""
    try:
        import io

        import openpyxl

        wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
        try:
            rows: list[str] = []
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    cells = [str(c) if c is not None else "" for c in row]
                    if any(c.strip() for c in cells):
                        rows.append("\t".join(cells))
        finally:
            # Read-only workbooks load lazily and must be closed explicitly.
            wb.close()
        return "\n".join(rows), None
    except Exception as e:
        return "", f"XLSX extraction failed: {e}"


def _extract_parquet_text(file_bytes: bytes) -> tuple[str, str | None]:
    """Render a Parquet table as ``row_N:`` blocks with one labelled line per column."""
    try:
        import io

        import pyarrow.parquet as pq  # type: ignore[import-not-found, import-untyped]

        table = pq.read_table(io.BytesIO(file_bytes))
        column_names = table.schema.names
        lines: list[str] = []
        for row_index in range(table.num_rows):
            lines.append(f"row_{row_index + 1}:")
            for col in column_names:
                cell = table.column(col)[row_index].as_py()
                cell_str = "" if cell is None else str(cell)
                # Multi-line cell values continue on their own lines.
                first_line, *continuation_lines = cell_str.splitlines() or [""]
                lines.append(f" {col}: {first_line}")
                lines.extend(f" {cont}" for cont in continuation_lines)
            lines.append("")
        return "\n".join(lines), None
    except Exception as e:
        return "", f"Parquet extraction failed: {e}"


def extract_text(
    file_bytes: bytes,
    mime_type: str,
    *,
    file_name: str = "",
    enable_ocr: bool = False,
) -> tuple[str, str | None]:
    """
    Extract plain text from file bytes based on MIME type.

    Args:
        file_bytes: Raw file content.
        mime_type: MIME type of the content. Parameters such as
            ``; charset=utf-8`` and letter case are ignored.
        file_name: Optional original name or URL; used for extension-based OCR
            routing and in log messages.
        enable_ocr: When True, formats supported by Docling are tried through
            Docling first. PDFs with a usable native text layer skip Docling.

    Returns:
        (text_content, error_message_or_None). Media types (image/audio/video)
        and unknown types return ``("", None)`` when no OCR text was produced.
    """
    # Normalize once so "Text/HTML; charset=utf-8" takes the same branch as "text/html".
    mime_type = _normalize_mime_type(mime_type)

    # Result of the cheap PDF pass, kept so the non-OCR PDF branch below does
    # not parse the same document a second time.
    native_pdf: tuple[str, str | None] | None = None

    if enable_ocr and _supports_docling_ocr(mime_type, file_name):
        # PDFs: try cheap native text extraction first. Only hand off to the
        # heavy docling pipeline when the native path yields too little text,
        # which indicates a scanned or image-only PDF that genuinely needs OCR.
        # This avoids loading the ~1 GB StandardPdfPipeline for the majority of
        # PDFs that already carry a text layer.
        if mime_type == "application/pdf":
            native_pdf = _extract_pdf_text(file_bytes)
            cheap_text, cheap_error = native_pdf
            if len(cheap_text.strip()) >= _MIN_NATIVE_PDF_CHARS:
                logger.info(
                    "OCR extracted %d chars from %s (%s, native text layer)",
                    len(cheap_text.strip()),
                    file_name or mime_type,
                    mime_type,
                )
                return cheap_text, cheap_error
        # Images, DOCX, PPTX, and sparse/scanned PDFs: use docling.
        text, error = _extract_docling_markdown(
            file_bytes,
            mime_type=mime_type,
            file_name=file_name,
        )
        if text:
            return text, None
        if error:
            logger.warning("OCR extraction failed for %s: %s", file_name or mime_type, error)
        # No OCR text: fall through to the format-specific extractors below.

    # Binary media types — no text extraction
    if mime_type.startswith(("image/", "audio/", "video/")):
        return "", None

    if mime_type == "application/pdf":
        return native_pdf if native_pdf is not None else _extract_pdf_text(file_bytes)

    if mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return _extract_docx_text(file_bytes)

    if mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        return _extract_xlsx_text(file_bytes)

    # HTML / XHTML
    if mime_type in ("text/html", "application/xhtml+xml"):
        try:
            from .content_extraction import html_to_text

            return html_to_text(_decode_bytes(file_bytes)), None
        except Exception as e:
            return "", f"HTML extraction failed: {e}"

    # JSON, XML — decode as-is
    if mime_type in ("application/json", "application/xml", "text/xml"):
        return _decode_bytes(file_bytes), None

    # Plain text, CSV, Markdown, and other text/* types
    if mime_type.startswith("text/"):
        return _decode_bytes(file_bytes), None

    if mime_type in ("application/parquet", "application/vnd.apache.parquet"):
        return _extract_parquet_text(file_bytes)

    # Unknown / binary
    return "", None
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def _decode_bytes(file_bytes: bytes) -> str:
|
|
518
|
+
"""Decode bytes to str using chardet for encoding detection."""
|
|
519
|
+
try:
|
|
520
|
+
import chardet
|
|
521
|
+
|
|
522
|
+
detected = chardet.detect(file_bytes[:65536])
|
|
523
|
+
encoding = detected.get("encoding") or "utf-8"
|
|
524
|
+
return file_bytes.decode(encoding, errors="replace")
|
|
525
|
+
except Exception:
|
|
526
|
+
return file_bytes.decode("utf-8", errors="replace")
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
# Office Open XML documents are ZIP archives, so signature-based detection
# reports them as "application/zip". These are the extension-derived types
# that may refine such a generic ZIP result.
_ZIP_CONTAINER_MIME_TYPES = frozenset(
    {
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    }
)


def resolve_mime_type(
    file_bytes: bytes,
    *,
    declared_mime_type: str | None = None,
    file_name: str = "",
) -> str:
    """
    Resolve effective MIME type from declared hint, magic-byte detection, and file extension.

    Precedence is declared type, then detected type, then extension-inferred
    type; at each step "application/octet-stream" and empty values are
    skipped. The winner then goes through ``normalize_detected_mime_type``.
    Finally, a generic "application/zip" is refined to the DOCX/XLSX/PPTX type
    when the file extension names one of those formats. Without this, OOXML
    files with no declared type resolve to "application/zip" and reach no text
    extractor.

    Kept separate from full parsing so callers can detect format cheaply without
    paying for text extraction (e.g. when content will be streamed in pages later).

    Args:
        file_bytes: Raw content used for detection.
        declared_mime_type: Content type reported by the source, if any.
        file_name: File name or URL used for extension-based inference.

    Returns:
        The resolved MIME type; "application/octet-stream" when nothing more
        specific is known.
    """
    generic = "application/octet-stream"

    declared_mime = _normalize_mime_type(declared_mime_type)
    detected_mime = _normalize_mime_type(detect_mime_type(file_bytes))
    inferred_mime = infer_mime_type_from_file_name(file_name)

    # First specific candidate wins, in order: declared > detected > inferred.
    candidates = (declared_mime, detected_mime, inferred_mime)
    mime_type = next((c for c in candidates if c and c != generic), generic)

    mime_type = normalize_detected_mime_type(mime_type, file_name)
    if mime_type == generic and inferred_mime != generic:
        mime_type = inferred_mime

    # A ZIP container with an OOXML extension is that document type.
    if mime_type == "application/zip" and inferred_mime in _ZIP_CONTAINER_MIME_TYPES:
        mime_type = inferred_mime

    return mime_type
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def parse_bytes(
    file_bytes: bytes,
    *,
    declared_mime_type: str | None = None,
    file_name: str = "",
    enable_ocr: bool = False,
) -> ParsedBytes:
    """
    Resolve the MIME type of an in-memory payload and extract its raw/text content.

    This is the one-shot entry point for the sandbox and any caller that needs
    a complete ParsedBytes. Object-storage sources prefer resolve_mime_type()
    followed by iter_file_pages(), which avoids holding all content in memory
    before detector scanning.

    Args:
        file_bytes: Payload to parse.
        declared_mime_type: Content type reported by the source, if any.
        file_name: File name or URL, used for extension-based hints.
        enable_ocr: Forwarded to extract_text().

    Returns:
        ParsedBytes with the resolved MIME type, the decoded raw content (text-like
        types only), the extracted text, the binary flag, size and parse error.
    """
    mime_type = resolve_mime_type(
        file_bytes,
        declared_mime_type=declared_mime_type,
        file_name=file_name,
    )
    text_content, parse_error = extract_text(
        file_bytes,
        mime_type,
        file_name=file_name,
        enable_ocr=enable_ocr,
    )

    raw_content = ""
    if _is_text_like_mime_type(mime_type):
        raw_content = _decode_bytes(file_bytes)

    # HTML whose extraction produced nothing gets a second attempt from the raw markup.
    is_html = mime_type in {"text/html", "application/xhtml+xml"}
    if is_html and raw_content and not text_content:
        from .content_extraction import html_to_text

        text_content = html_to_text(raw_content)

    media_prefixes = ("image/", "audio/", "video/")
    is_binary = mime_type == "application/octet-stream" or mime_type.startswith(media_prefixes)

    return ParsedBytes(
        mime_type=mime_type,
        raw_content=raw_content,
        text_content=text_content,
        is_binary=is_binary,
        file_size_bytes=len(file_bytes),
        parse_error=parse_error,
    )
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
def iter_file_pages(
    file_bytes: bytes,
    mime_type: str,
    batch_size: int = 100,
    include_column_names: bool = True,
    *,
    file_name: str = "",
    enable_ocr: bool = False,
) -> Generator[str, None, None]:
    """
    Yield the content of a file as a sequence of text pages.

    Parquet → delegated to _iter_parquet_pages() with batch_size.
    CSV / TSV → delegated to _iter_csv_pages(); batch_size is not forwarded.
    All other extractable types (PDF, DOCX, TXT, JSON, XML, XLSX, …) → the full
    text is extracted once via extract_text(), then yielded in chunks of up to
    batch_size *lines*.
    Non-extractable types (images, audio, video, unknown binary) → yields nothing.

    New file formats only need to be added to extract_text() — not here.
    """
    essence = _normalize_mime_type(mime_type)

    if essence in ("application/parquet", "application/vnd.apache.parquet"):
        yield from _iter_parquet_pages(file_bytes, batch_size, include_column_names)
        return

    if essence in ("text/csv", "text/tab-separated-values"):
        yield from _iter_csv_pages(file_bytes, include_column_names)
        return

    text, error = extract_text(
        file_bytes,
        essence,
        file_name=file_name,
        enable_ocr=enable_ocr,
    )
    if error:
        logger.warning("Text extraction error (%s): %s", mime_type, error)
    if text:
        yield from _iter_text_lines(text, batch_size)
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def _iter_text_lines(text: str, batch_size: int) -> Generator[str, None, None]:
|
|
647
|
+
"""Yield non-empty text in chunks of batch_size lines."""
|
|
648
|
+
lines = text.splitlines(keepends=True)
|
|
649
|
+
for start in range(0, len(lines), batch_size):
|
|
650
|
+
chunk = "".join(lines[start : start + batch_size])
|
|
651
|
+
if chunk.strip():
|
|
652
|
+
yield chunk
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
_PARQUET_MAGIC = b"PAR1"


def _iter_parquet_pages(
    file_bytes: bytes,
    batch_size: int,
    include_column_names: bool,
) -> Generator[str, None, None]:
    """Yield one text page per Parquet row, reading the file in record batches.

    Each page is a ``row_N:`` header followed by one line per column (labelled
    with the column name when include_column_names is True) and a trailing
    blank line. Failures are logged as warnings and end the iteration.
    """
    # Parquet files begin AND end with the 4-byte magic "PAR1". A missing footer
    # means the bytes were truncated mid-download; pyarrow's C++ thread pool
    # would then hang indefinitely trying to read schema metadata that isn't
    # there, locking all worker threads on a futex. Bail out early instead.
    too_short = len(file_bytes) < 8
    if too_short or not file_bytes.endswith(_PARQUET_MAGIC):
        logger.warning(
            "Parquet bytes appear truncated (footer magic missing, %d bytes); skipping",
            len(file_bytes),
        )
        return

    try:
        import io

        import pyarrow.parquet as pq  # type: ignore[import-not-found, import-untyped]

        # ParquetFile + iter_batches() avoids loading the whole table into
        # memory and surfaces schema errors early (before reading any data),
        # so a bad file can't lock the C++ thread pool.
        parquet_file = pq.ParquetFile(io.BytesIO(file_bytes))
        row_number = 0
        for batch in parquet_file.iter_batches(batch_size=batch_size):
            names = batch.schema.names
            for row_idx in range(batch.num_rows):
                row_number += 1
                page: list[str] = [f"row_{row_number}:"]
                for position, name in enumerate(names):
                    value = batch.column(position)[row_idx].as_py()
                    rendered = "" if value is None else str(value)
                    # Multi-line values continue on their own lines.
                    head, *tail = rendered.splitlines() or [""]
                    page.append(f" {name}: {head}" if include_column_names else f" {head}")
                    page.extend(f" {extra}" for extra in tail)
                page.append("")
                yield "\n".join(page)
    except Exception as exc:
        logger.warning("Parquet page iteration failed: %s", exc)
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
def _iter_csv_pages(
    file_bytes: bytes,
    include_column_names: bool,
) -> Generator[str, None, None]:
    """Yield one formatted text page per data row of a CSV or TSV payload.

    The first line supplies the column names. The delimiter is a tab when the
    header line contains tabs and no commas, and a comma otherwise. This
    function also receives "text/tab-separated-values" content, which a
    comma-only reader would collapse into a single unlabelled column.

    Args:
        file_bytes: Encoded CSV/TSV content.
        include_column_names: Forwarded to _format_tabular_page().

    Yields:
        One page per row, numbered from 1. Failures are logged as warnings
        and end the iteration.
    """
    import csv
    import io

    try:
        text = _decode_bytes(file_bytes)

        header_line = text.split("\n", 1)[0]
        delimiter = "\t" if "\t" in header_line and "," not in header_line else ","

        reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
        headers = list(reader.fieldnames or [])

        for row_number, row in enumerate(reader, start=1):
            yield _format_tabular_page([dict(row)], headers, row_number, include_column_names)
    except Exception as exc:
        logger.warning("CSV page iteration failed: %s", exc)
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def _format_tabular_page(
|
|
725
|
+
rows: list[dict[str, str]],
|
|
726
|
+
headers: list[str],
|
|
727
|
+
abs_row_start: int,
|
|
728
|
+
include_column_names: bool,
|
|
729
|
+
) -> str:
|
|
730
|
+
lines: list[str] = []
|
|
731
|
+
for i, row in enumerate(rows):
|
|
732
|
+
lines.append(f"row_{abs_row_start + i}:")
|
|
733
|
+
for col in headers:
|
|
734
|
+
first, *rest = (row.get(col) or "").splitlines() or [""]
|
|
735
|
+
lines.append(f" {col}: {first}" if include_column_names else f" {first}")
|
|
736
|
+
lines.extend(f" {c}" for c in rest)
|
|
737
|
+
lines.append("")
|
|
738
|
+
return "\n".join(lines)
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def parse_file(file_path: Path, *, enable_ocr: bool = False) -> ParsedFile:
    """
    Read a file from disk, resolve its MIME type and extract its text.

    Args:
        file_path: Path to the file on disk.
        enable_ocr: Forwarded to parse_bytes().

    Returns:
        ParsedFile with mime_type, text_content, is_binary, etc. The encoding
        is only detected for non-binary files that produced text.

    Raises:
        FileNotFoundError: If file_path does not exist.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    file_bytes = file_path.read_bytes()
    parsed = parse_bytes(file_bytes, file_name=file_path.name, enable_ocr=enable_ocr)

    encoding: str | None = None
    has_text = bool(parsed.text_content) and not parsed.is_binary
    if has_text:
        try:
            import chardet

            encoding = chardet.detect(file_bytes[:65536]).get("encoding")
        except Exception:
            # Encoding detection is best-effort; leave it unset on failure.
            encoding = None

    return ParsedFile(
        mime_type=parsed.mime_type,
        text_content=parsed.text_content,
        is_binary=parsed.is_binary,
        file_size_bytes=parsed.file_size_bytes,
        encoding=encoding,
        parse_error=parsed.parse_error,
    )
|