classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,777 @@
1
+ """MIME detection and text extraction utilities for local file parsing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import tempfile
7
+ import threading
8
+ from collections.abc import Generator
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from urllib.parse import urlsplit
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
@dataclass
class ParsedFile:
    """Result of parsing a local file.

    Built by parse_file() from the ParsedBytes produced for the file's bytes,
    plus a best-effort encoding guess.
    """

    # Resolved MIME type of the file.
    mime_type: str
    # Text extracted from the file; empty when nothing could be extracted.
    text_content: str
    # Whether the resolved MIME type is treated as binary.
    is_binary: bool
    # Number of bytes read from disk.
    file_size_bytes: int = 0
    # Character encoding guessed by chardet, or None when not detected.
    encoding: str | None = None
    # Extraction error message, or None when extraction reported no error.
    parse_error: str | None = None
26
+
27
+
28
@dataclass
class ParsedBytes:
    """Result of parsing in-memory bytes (see parse_bytes())."""

    # Resolved MIME type of the payload.
    mime_type: str
    # Decoded payload for text-like MIME types; empty string otherwise.
    raw_content: str
    # Text produced by extract_text() (HTML is converted to plain text).
    text_content: str
    # True for image/audio/video payloads and application/octet-stream.
    is_binary: bool
    # Length of the input in bytes.
    file_size_bytes: int
    # Extraction error message, or None when extraction reported no error.
    parse_error: str | None = None
38
+
39
+
40
# MIME types outside text/* whose bytes are still decodable text; consulted by
# _is_text_like_mime_type().
_TEXT_RAW_MIME_TYPES = {
    "application/json",
    "application/xml",
    "text/xml",
    "application/xhtml+xml",
}

# Tabular formats. normalize_detected_mime_type() prefers one of these (as
# inferred from the file extension) over a detected "text/plain".
_TABULAR_MIME_TYPES = {
    "text/csv",
    "text/tab-separated-values",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/parquet",
    "application/vnd.apache.parquet",
}

# Lower-case file extension -> MIME type, used when inferring a type from a
# file name or URL path and when naming temporary files.
_MIME_HINTS_BY_EXTENSION = {
    ".csv": "text/csv",
    ".tsv": "text/tab-separated-values",
    ".parquet": "application/parquet",
    ".json": "application/json",
    ".xml": "application/xml",
    ".html": "text/html",
    ".htm": "text/html",
    ".md": "text/markdown",
    ".txt": "text/plain",
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".bmp": "image/bmp",
    ".tif": "image/tiff",
    ".tiff": "image/tiff",
    ".webp": "image/webp",
}

# Image MIME types that _supports_docling_ocr() accepts for the Docling path.
_DOCLING_IMAGE_MIME_TYPES = {
    "image/png",
    "image/jpeg",
    "image/tiff",
    "image/bmp",
    "image/webp",
}

# Document MIME types that _supports_docling_ocr() accepts for the Docling path.
_DOCLING_MIME_TYPES = {
    "application/pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "text/html",
    "application/xhtml+xml",
}
# Extension fallback for _supports_docling_ocr() when the MIME type matches
# neither of the two sets above.
_DOCLING_EXTENSIONS = {
    ".pdf",
    ".docx",
    ".pptx",
    ".xlsx",
    ".html",
    ".htm",
    ".png",
    ".jpg",
    ".jpeg",
    ".bmp",
    ".tif",
    ".tiff",
    ".webp",
}
111
+
112
+
113
+ class _DoclingState:
114
+ """Mutable singleton state for the Docling DocumentConverter.
115
+
116
+ Stored as object attributes so functions can mutate state without `global`
117
+ statements (which ruff PLW0603 disallows). Initializing the converter is
118
+ expensive (loads ML models), so it happens exactly once per process.
119
+ """
120
+
121
+ def __init__(self) -> None:
122
+ self.converter: object = None
123
+ self.error: str | None = None
124
+ self.attempted: bool = False
125
+
126
+
127
# Process-wide cache for the Docling converter; filled in lazily by
# _get_docling_converter().
_docling_state = _DoclingState()
# Guards first-time converter initialization and resets of the cached state.
_docling_lock = threading.Lock()
# Limits concurrent converter.convert() calls to prevent OOM. The docling
# StandardPdfPipeline alone holds ~1 GB of model weights; additional working
# memory per in-flight conversion can push the process over the 4 GiB K8s
# limit when two conversions run simultaneously.
_docling_conversion_sem = threading.Semaphore(1)
134
+
135
+
136
def _get_docling_converter() -> tuple[object, str | None]:
    """Return the shared DocumentConverter, creating it on first use.

    Returns:
        (converter, error): the cached converter (None if initialization
        failed) and the failure message (None if initialization succeeded).
    """
    state = _docling_state

    # Lock-free shortcut, taken only when the outcome is final (a converter
    # exists or a failure was recorded). `attempted` cannot be used for this:
    # it flips to True before the install+init completes, so a thread arriving
    # mid-initialization would get (None, None) and report the converter as
    # unavailable instead of waiting on the lock.
    settled = state.converter is not None or state.error is not None
    if settled:
        return state.converter, state.error

    with _docling_lock:
        if not state.attempted:
            state.attempted = True
            try:
                from ..sources.dependencies import require_module

                module = require_module(
                    "docling.document_converter",
                    "file parser OCR",
                    ["ocr"],
                    detail="OCR extraction requires the Docling optional dependency.",
                )
                state.converter = module.DocumentConverter()
            except Exception as exc:
                # Remember the failure so later calls return it immediately.
                state.error = str(exc)
        return state.converter, state.error
162
+
163
+
164
def _reset_docling_singleton() -> None:
    """Forget the cached Docling converter so the next call re-initializes it.

    Meant for test isolation only.
    """
    with _docling_lock:
        state = _docling_state
        state.attempted = False
        state.error = None
        state.converter = None
170
+
171
+
172
+ def _normalize_mime_type(mime_type: str | None) -> str:
173
+ if not mime_type:
174
+ return ""
175
+ return str(mime_type).split(";", 1)[0].strip().lower()
176
+
177
+
178
+ def _file_extension(file_name: str) -> str:
179
+ if not file_name:
180
+ return ""
181
+ path = urlsplit(file_name).path
182
+ value = path if path else file_name
183
+ return Path(value).suffix.lower()
184
+
185
+
186
def infer_mime_type_from_file_name(file_name: str) -> str:
    """Map a file name or URL to a MIME type using its extension.

    Returns "application/octet-stream" when the extension is missing or is
    not listed in _MIME_HINTS_BY_EXTENSION.
    """
    try:
        return _MIME_HINTS_BY_EXTENSION[_file_extension(file_name)]
    except KeyError:
        return "application/octet-stream"
190
+
191
+
192
def normalize_detected_mime_type(detected_mime_type: str, file_name: str) -> str:
    """
    Reconcile a detected MIME type with what the file name suggests.

    The extension-derived type wins in two situations: the detected type is
    empty or the generic "application/octet-stream", or the detected type is
    "text/plain" while the extension points at a tabular format. This keeps
    parsing stable for sources that label tabular files with generic or
    plain-text content types. Otherwise the normalized detected type is
    returned.
    """
    detected = _normalize_mime_type(detected_mime_type)
    by_extension = infer_mime_type_from_file_name(file_name)

    if detected in ("", "application/octet-stream"):
        return by_extension

    mislabelled_tabular = detected == "text/plain" and by_extension in _TABULAR_MIME_TYPES
    return by_extension if mislabelled_tabular else detected
209
+
210
+
211
def _is_text_like_mime_type(mime_type: str) -> bool:
    """Tell whether a MIME type denotes content that can be decoded as text.

    True for every text/* type and for the members of _TEXT_RAW_MIME_TYPES.
    """
    mime = _normalize_mime_type(mime_type)
    if mime.startswith("text/"):
        return True
    return mime in _TEXT_RAW_MIME_TYPES
214
+
215
+
216
+ def _detect_magic_mime_type(file_bytes: bytes) -> str | None:
217
+ signatures: tuple[tuple[bytes, str], ...] = (
218
+ (b"\x89PNG\r\n\x1a\n", "image/png"),
219
+ (b"%PDF-", "application/pdf"),
220
+ (b"\xff\xd8\xff", "image/jpeg"),
221
+ (b"GIF87a", "image/gif"),
222
+ (b"GIF89a", "image/gif"),
223
+ (b"PK\x03\x04", "application/zip"),
224
+ )
225
+
226
+ for signature, mime_type in signatures:
227
+ if file_bytes.startswith(signature):
228
+ return mime_type
229
+
230
+ return None
231
+
232
+
233
+ def _sniff_text_mime(file_bytes: bytes) -> str:
234
+ """Fallback MIME detection for text formats not handled by filetype."""
235
+ # Check for null bytes → binary
236
+ if b"\x00" in file_bytes[:8192]:
237
+ return "application/octet-stream"
238
+
239
+ # Try to decode a sample for text-based sniffing
240
+ sample = ""
241
+ try:
242
+ import chardet
243
+
244
+ detected = chardet.detect(file_bytes[:4096])
245
+ encoding = detected.get("encoding") or "utf-8"
246
+ sample = file_bytes[:4096].decode(encoding, errors="replace")
247
+ except Exception:
248
+ try:
249
+ sample = file_bytes[:4096].decode("utf-8", errors="replace")
250
+ except Exception:
251
+ return "application/octet-stream"
252
+
253
+ stripped = sample.lstrip()
254
+
255
+ if stripped.startswith("{") or stripped.startswith("["):
256
+ return "application/json"
257
+ if stripped.startswith("<?xml"):
258
+ return "application/xml"
259
+ if stripped.lower().startswith("<!doctype html") or stripped.lower().startswith("<html"):
260
+ return "text/html"
261
+
262
+ # CSV heuristic: first non-empty line has multiple commas
263
+ first_line = stripped.split("\n")[0] if "\n" in stripped else stripped
264
+ if first_line.count(",") >= 2:
265
+ return "text/csv"
266
+
267
+ return "text/plain"
268
+
269
+
270
def detect_mime_type(file_bytes: bytes) -> str:
    """
    Detect MIME type from file bytes.

    Detection runs in three stages and stops at the first hit:

    1. the built-in magic-byte signature table,
    2. the optional `filetype` library (skipped if it is missing or fails),
    3. text-based sniffing for formats without magic bytes.

    Returns:
        The detected MIME type; "application/octet-stream" for empty input.
    """
    if not file_bytes:
        return "application/octet-stream"

    magic_mime_type = _detect_magic_mime_type(file_bytes)
    if magic_mime_type:
        return magic_mime_type

    try:
        import filetype

        kind = filetype.guess(file_bytes)
        if kind is not None:
            return str(kind.mime)
    except Exception as exc:
        # Best-effort stage: fall through to text sniffing. Lazy %-args avoid
        # formatting the message when debug logging is disabled.
        logger.debug("filetype detection failed: %s", exc)

    return _sniff_text_mime(file_bytes)
294
+
295
+
296
def _supports_docling_ocr(mime_type: str, file_name: str) -> bool:
    """Decide whether a payload is eligible for the Docling extraction path.

    Eligible when the normalized MIME type is one of the Docling image or
    document types, or failing that, when the file extension is one of
    _DOCLING_EXTENSIONS.
    """
    normalized = _normalize_mime_type(mime_type)
    known_mime = normalized in _DOCLING_IMAGE_MIME_TYPES or normalized in _DOCLING_MIME_TYPES
    return known_mime or _file_extension(file_name) in _DOCLING_EXTENSIONS
303
+
304
+
305
# Minimum number of natively extracted characters (after stripping) for a PDF
# to be treated as having a usable text layer. PDFs with fewer extracted chars
# than this are likely scanned/image-only and need the full docling OCR
# pipeline. Most text-layer PDFs yield hundreds of chars; a threshold of 50 is
# conservative enough to never skip real content.
_MIN_NATIVE_PDF_CHARS = 50
309
+
310
+
311
+ def _extract_pdf_text(file_bytes: bytes) -> tuple[str, str | None]:
312
+ """Extract text from a PDF using pdfplumber (no ML models required)."""
313
+ try:
314
+ import io
315
+
316
+ import pdfplumber
317
+
318
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
319
+ pages = []
320
+ for page in pdf.pages:
321
+ text = page.extract_text() or ""
322
+ if text:
323
+ pages.append(text)
324
+ return "\n\n".join(pages), None
325
+ except Exception as e:
326
+ return "", f"PDF extraction failed: {e}"
327
+
328
+
329
def _temp_file_name(file_name: str, mime_type: str) -> str:
    """Choose a temp-file name whose extension reflects the payload's format.

    Preference order: the extension of `file_name`; the first extension in
    _MIME_HINTS_BY_EXTENSION registered for `mime_type`; the image subtype
    (with "jpeg" spelled "jpg"); and finally the generic "input.bin".
    """
    own_extension = _file_extension(file_name)
    if own_extension:
        return f"input{own_extension}"

    hinted = next(
        (ext for ext, hint in _MIME_HINTS_BY_EXTENSION.items() if hint == mime_type),
        None,
    )
    if hinted is not None:
        return f"input{hinted}"

    if mime_type.startswith("image/"):
        subtype = mime_type.split("/", maxsplit=1)[1].replace("jpeg", "jpg")
        return f"input.{subtype}"

    return "input.bin"
343
+
344
+
345
def _extract_docling_markdown(
    file_bytes: bytes,
    *,
    mime_type: str,
    file_name: str,
) -> tuple[str, str | None]:
    """Convert a document to Markdown with the shared Docling converter.

    The bytes are written to a temporary file (named so its extension reflects
    the format) because the converter is invoked with a path.

    Returns:
        (markdown, None) on success, or ("", message) when the converter is
        unavailable or the conversion raises.
    """
    converter, error = _get_docling_converter()
    if error:
        return "", error
    if converter is None:
        return "", "Docling converter unavailable"

    temp_fname = _temp_file_name(file_name, mime_type)
    try:
        with tempfile.TemporaryDirectory(prefix="classifyre-docling-") as temp_dir:
            temp_path = Path(temp_dir) / temp_fname
            temp_path.write_bytes(file_bytes)
            # One conversion at a time; see the note on _docling_conversion_sem.
            with _docling_conversion_sem:
                result = converter.convert(temp_path)  # type: ignore[union-attr]
            text = result.document.export_to_markdown().strip()
            # Page count is only logged, and only when the document exposes it.
            page_count = len(result.document.pages) if hasattr(result.document, "pages") else None
            logger.info(
                "OCR extracted %d chars from %s (%s%s)",
                len(text),
                file_name or mime_type,
                mime_type,
                f", {page_count} pages" if page_count else "",
            )
            return text, None
    except Exception as exc:
        return "", f"Docling extraction failed: {exc}"
376
+
377
+
378
def extract_text(
    file_bytes: bytes,
    mime_type: str,
    *,
    file_name: str = "",
    enable_ocr: bool = False,
) -> tuple[str, str | None]:
    """
    Extract plain text from file bytes based on MIME type.

    The MIME type is normalized first (parameters dropped, lower-cased) so
    values such as "Text/HTML; charset=utf-8" reach the right branch.

    Args:
        file_bytes: Raw file content.
        mime_type: MIME type used to select the extraction strategy.
        file_name: Optional name or URL; its extension can enable the OCR path
            and is used in log messages.
        enable_ocr: When True, Docling-supported formats go through Docling
            (PDFs only if their native text layer is too sparse).

    Returns:
        (text_content, error_message_or_None). Formats with no extractor
        (images without OCR, audio, video, unknown binary) yield ("", None).
    """
    import io

    mime = _normalize_mime_type(mime_type)
    # Result of the cheap PDF pass, kept so the non-OCR PDF branch below does
    # not have to extract the same document a second time.
    native_pdf_result: tuple[str, str | None] | None = None

    if enable_ocr and _supports_docling_ocr(mime, file_name):
        # PDFs: try cheap native text extraction first. Only hand off to the
        # heavy docling pipeline when the native path yields too little text,
        # which indicates a scanned or image-only PDF that genuinely needs OCR.
        # This avoids loading the ~1 GB StandardPdfPipeline for the majority of
        # PDFs that already carry a text layer.
        if mime == "application/pdf":
            native_pdf_result = _extract_pdf_text(file_bytes)
            cheap_text, cheap_error = native_pdf_result
            if len(cheap_text.strip()) >= _MIN_NATIVE_PDF_CHARS:
                logger.info(
                    "OCR extracted %d chars from %s (%s, native text layer)",
                    len(cheap_text.strip()),
                    file_name or mime,
                    mime,
                )
                return cheap_text, cheap_error
        # Images, DOCX, PPTX, and sparse/scanned PDFs: use docling.
        text, error = _extract_docling_markdown(
            file_bytes,
            mime_type=mime,
            file_name=file_name,
        )
        if text:
            return text, None
        if error:
            logger.warning("OCR extraction failed for %s: %s", file_name or mime, error)

    # Binary media types — no text extraction
    if mime.startswith(("image/", "audio/", "video/")):
        return "", None

    # PDF
    if mime == "application/pdf":
        if native_pdf_result is not None:
            return native_pdf_result
        return _extract_pdf_text(file_bytes)

    # DOCX
    if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        try:
            import docx

            doc = docx.Document(io.BytesIO(file_bytes))
            parts: list[str] = [para.text for para in doc.paragraphs if para.text.strip()]
            for table in doc.tables:
                for row in table.rows:
                    parts.extend(cell.text for cell in row.cells if cell.text.strip())
            return "\n".join(parts), None
        except Exception as e:
            return "", f"DOCX extraction failed: {e}"

    # XLSX
    if mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        try:
            import openpyxl

            wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
            try:
                rows: list[str] = []
                for sheet in wb.worksheets:
                    for row in sheet.iter_rows(values_only=True):
                        cells = ["" if c is None else str(c) for c in row]
                        if any(c.strip() for c in cells):
                            rows.append("\t".join(cells))
            finally:
                # Read-only workbooks keep their source open until closed.
                wb.close()
            return "\n".join(rows), None
        except Exception as e:
            return "", f"XLSX extraction failed: {e}"

    # HTML / XHTML
    if mime in ("text/html", "application/xhtml+xml"):
        try:
            from .content_extraction import html_to_text

            return html_to_text(_decode_bytes(file_bytes)), None
        except Exception as e:
            return "", f"HTML extraction failed: {e}"

    # JSON, XML — decode as-is
    if mime in (
        "application/json",
        "application/xml",
        "text/xml",
    ):
        return _decode_bytes(file_bytes), None

    # Plain text, CSV, Markdown, and other text/* types
    if mime.startswith("text/"):
        return _decode_bytes(file_bytes), None

    # Parquet
    if mime in ("application/parquet", "application/vnd.apache.parquet"):
        # Same guard as _iter_parquet_pages(): a missing footer magic means the
        # bytes were truncated, and handing those to pyarrow can hang.
        if len(file_bytes) < 8 or file_bytes[-4:] != _PARQUET_MAGIC:
            return "", "Parquet extraction failed: truncated file (footer magic missing)"
        try:
            import pyarrow.parquet as pq  # type: ignore[import-not-found, import-untyped]

            table = pq.read_table(io.BytesIO(file_bytes))
            # Resolve each column once instead of once per cell.
            columns = [(col, table.column(col)) for col in table.schema.names]
            lines: list[str] = []
            for row_index in range(table.num_rows):
                lines.append(f"row_{row_index + 1}:")
                for col, col_array in columns:
                    cell = col_array[row_index].as_py()
                    cell_str = "" if cell is None else str(cell)
                    first_line, *continuation_lines = cell_str.splitlines() or [""]
                    lines.append(f" {col}: {first_line}")
                    lines.extend(f" {cont}" for cont in continuation_lines)
                lines.append("")
            return "\n".join(lines), None
        except Exception as e:
            return "", f"Parquet extraction failed: {e}"

    # Unknown / binary
    return "", None
515
+
516
+
517
+ def _decode_bytes(file_bytes: bytes) -> str:
518
+ """Decode bytes to str using chardet for encoding detection."""
519
+ try:
520
+ import chardet
521
+
522
+ detected = chardet.detect(file_bytes[:65536])
523
+ encoding = detected.get("encoding") or "utf-8"
524
+ return file_bytes.decode(encoding, errors="replace")
525
+ except Exception:
526
+ return file_bytes.decode("utf-8", errors="replace")
527
+
528
+
529
def resolve_mime_type(
    file_bytes: bytes,
    *,
    declared_mime_type: str | None = None,
    file_name: str = "",
) -> str:
    """
    Work out the effective MIME type for a payload.

    Three sources are consulted in priority order: the caller-declared type,
    the type detected from the bytes, and the type inferred from the file
    extension. The first one that is neither empty nor the generic
    "application/octet-stream" wins, and the winner is then passed through
    normalize_detected_mime_type().

    This is separate from full parsing so callers can learn the format cheaply
    without paying for text extraction (e.g. when the content is streamed in
    pages later).
    """
    generic = "application/octet-stream"
    declared = _normalize_mime_type(declared_mime_type)
    detected = _normalize_mime_type(detect_mime_type(file_bytes))
    inferred = infer_mime_type_from_file_name(file_name)

    chosen = next(
        (candidate for candidate in (declared, detected, inferred) if candidate and candidate != generic),
        None,
    )
    if chosen is None:
        # Nothing specific was found; keep whatever value is available.
        chosen = declared or detected or inferred or generic

    resolved = normalize_detected_mime_type(chosen, file_name)
    if resolved == generic and inferred != generic:
        resolved = inferred

    return resolved
559
+
560
+
561
def parse_bytes(
    file_bytes: bytes,
    *,
    declared_mime_type: str | None = None,
    file_name: str = "",
    enable_ocr: bool = False,
) -> ParsedBytes:
    """
    Parse in-memory bytes: resolve MIME type and extract raw/text content.

    Used by the sandbox and any caller that needs a complete ParsedBytes in one shot.
    Object-storage sources prefer resolve_mime_type() + iter_file_pages() to avoid
    loading all content into memory before detector scanning.

    Args:
        file_bytes: Raw content to parse.
        declared_mime_type: MIME type reported by the source, if any.
        file_name: Optional name or URL used for extension-based inference.
        enable_ocr: Forwarded to extract_text().

    Returns:
        ParsedBytes describing the payload. Extraction failures are reported
        through `parse_error` rather than raised.
    """
    file_size_bytes = len(file_bytes)
    mime_type = resolve_mime_type(
        file_bytes, declared_mime_type=declared_mime_type, file_name=file_name
    )

    text_content, parse_error = extract_text(
        file_bytes,
        mime_type,
        file_name=file_name,
        enable_ocr=enable_ocr,
    )
    # Only text-like payloads have a meaningful decoded form.
    raw_content = _decode_bytes(file_bytes) if _is_text_like_mime_type(mime_type) else ""

    if mime_type in {"text/html", "application/xhtml+xml"} and raw_content and not text_content:
        # Second chance for HTML that produced no text. Guarded because an
        # empty result from extract_text() may itself stem from html_to_text
        # raising; repeating that call unprotected would turn a reported
        # parse error into a crash.
        try:
            from .content_extraction import html_to_text

            text_content = html_to_text(raw_content)
        except Exception as exc:
            if parse_error is None:
                parse_error = f"HTML extraction failed: {exc}"

    is_binary = (
        mime_type.startswith(("image/", "audio/", "video/"))
        or mime_type == "application/octet-stream"
    )

    return ParsedBytes(
        mime_type=mime_type,
        raw_content=raw_content,
        text_content=text_content,
        is_binary=is_binary,
        file_size_bytes=file_size_bytes,
        parse_error=parse_error,
    )
606
+
607
+
608
def iter_file_pages(
    file_bytes: bytes,
    mime_type: str,
    batch_size: int = 100,
    include_column_names: bool = True,
    *,
    file_name: str = "",
    enable_ocr: bool = False,
) -> Generator[str, None, None]:
    """
    Stream a file's content as a sequence of text pages.

    Parquet → delegated to _iter_parquet_pages(), which reads the file in
    batches of batch_size rows.
    CSV / TSV → delegated to _iter_csv_pages(), which yields one labelled row
    per page (batch_size is not used on this path).
    Every other extractable type (PDF, DOCX, TXT, JSON, XML, XLSX, …) → the
    full text is extracted once via extract_text() and yielded in pages of
    batch_size lines.
    Non-extractable types (images, audio, video, unknown binary) → nothing.

    Supporting a new file format only requires extending extract_text().
    """
    normalized = _normalize_mime_type(mime_type)

    if normalized in ("application/parquet", "application/vnd.apache.parquet"):
        yield from _iter_parquet_pages(file_bytes, batch_size, include_column_names)
        return

    if normalized in ("text/csv", "text/tab-separated-values"):
        yield from _iter_csv_pages(file_bytes, include_column_names)
        return

    extracted, failure = extract_text(
        file_bytes,
        normalized,
        file_name=file_name,
        enable_ocr=enable_ocr,
    )
    if failure:
        logger.warning("Text extraction error (%s): %s", mime_type, failure)
    if extracted:
        yield from _iter_text_lines(extracted, batch_size)
644
+
645
+
646
+ def _iter_text_lines(text: str, batch_size: int) -> Generator[str, None, None]:
647
+ """Yield non-empty text in chunks of batch_size lines."""
648
+ lines = text.splitlines(keepends=True)
649
+ for start in range(0, len(lines), batch_size):
650
+ chunk = "".join(lines[start : start + batch_size])
651
+ if chunk.strip():
652
+ yield chunk
653
+
654
+
655
# 4-byte marker expected at the end of a complete Parquet file; used to detect
# truncated downloads before the bytes are handed to pyarrow.
_PARQUET_MAGIC = b"PAR1"
656
+
657
+
658
def _iter_parquet_pages(
    file_bytes: bytes,
    batch_size: int,
    include_column_names: bool,
) -> Generator[str, None, None]:
    """Yield Parquet rows rendered as labelled text pages.

    Rows are numbered from 1 across the whole file. Multi-line cell values
    are continued on following lines. Truncated input is skipped with a
    warning, and any failure while reading is logged rather than raised.
    """
    # Parquet files begin AND end with the 4-byte magic "PAR1". If the footer
    # is missing the bytes were truncated mid-download; pyarrow's C++ thread
    # pool will hang indefinitely trying to read schema metadata that isn't
    # there, locking all worker threads on a futex. Bail out early instead.
    if len(file_bytes) < 8 or file_bytes[-4:] != _PARQUET_MAGIC:
        logger.warning(
            "Parquet bytes appear truncated (footer magic missing, %d bytes); skipping",
            len(file_bytes),
        )
        return

    try:
        import io

        import pyarrow.parquet as pq  # type: ignore[import-not-found, import-untyped]

        # ParquetFile + iter_batches() reads one row-group at a time instead of
        # loading the whole table into memory, and surfaces schema errors early
        # (before reading any data) so a bad file can't lock the C++ thread pool.
        pf = pq.ParquetFile(io.BytesIO(file_bytes))
        abs_row = 0
        for batch in pf.iter_batches(batch_size=batch_size):
            col_names = batch.schema.names
            # NOTE(review): reconstructed as one yielded page per row, matching
            # _iter_csv_pages — confirm against the original indentation.
            for local_idx in range(batch.num_rows):
                lines: list[str] = []
                lines.append(f"row_{abs_row + 1}:")
                for col_i, col in enumerate(col_names):
                    cell = batch.column(col_i)[local_idx].as_py()
                    cell_str = "" if cell is None else str(cell)
                    # First line carries the (optional) column label; the
                    # remaining lines of a multi-line value follow unlabelled.
                    first, *rest = cell_str.splitlines() or [""]
                    lines.append(f" {col}: {first}" if include_column_names else f" {first}")
                    lines.extend(f" {c}" for c in rest)
                lines.append("")
                abs_row += 1
                if lines:
                    yield "\n".join(lines)
    except Exception as exc:
        logger.warning("Parquet page iteration failed: %s", exc)
701
+
702
+
703
def _iter_csv_pages(
    file_bytes: bytes,
    include_column_names: bool,
) -> Generator[str, None, None]:
    """Yield one labelled text page per data row of a CSV or TSV payload.

    The first row supplies the column names. The delimiter is chosen from the
    header line: a tab when tabs outnumber commas there, otherwise a comma.
    This matters because iter_file_pages() routes "text/tab-separated-values"
    here as well, and a comma-only reader would collapse every TSV row into a
    single column.

    Failures while decoding or parsing are logged rather than raised.
    """
    import csv
    import io

    try:
        text = _decode_bytes(file_bytes)

        # Inspect only the first non-blank line to choose the delimiter.
        header_line = text.lstrip("\r\n").split("\n", 1)[0]
        delimiter = "\t" if header_line.count("\t") > header_line.count(",") else ","

        reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
        headers = list(reader.fieldnames or [])

        # Row numbers are 1-based and count data rows only.
        for row_number, row in enumerate(reader, start=1):
            yield _format_tabular_page([dict(row)], headers, row_number, include_column_names)
    except Exception as exc:
        logger.warning("CSV page iteration failed: %s", exc)
722
+
723
+
724
+ def _format_tabular_page(
725
+ rows: list[dict[str, str]],
726
+ headers: list[str],
727
+ abs_row_start: int,
728
+ include_column_names: bool,
729
+ ) -> str:
730
+ lines: list[str] = []
731
+ for i, row in enumerate(rows):
732
+ lines.append(f"row_{abs_row_start + i}:")
733
+ for col in headers:
734
+ first, *rest = (row.get(col) or "").splitlines() or [""]
735
+ lines.append(f" {col}: {first}" if include_column_names else f" {first}")
736
+ lines.extend(f" {c}" for c in rest)
737
+ lines.append("")
738
+ return "\n".join(lines)
739
+
740
+
741
def parse_file(file_path: Path, *, enable_ocr: bool = False) -> ParsedFile:
    """
    Parse a local file: detect MIME type and extract text.

    Args:
        file_path: Path to the file on disk.
        enable_ocr: Forwarded to parse_bytes(); enables the Docling path for
            supported formats.

    Returns:
        ParsedFile with mime_type, text_content, is_binary, etc. `encoding`
        is a best-effort guess and is only reported for text-like MIME types.

    Raises:
        FileNotFoundError: If file_path does not exist.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    file_bytes = file_path.read_bytes()
    parsed = parse_bytes(file_bytes, file_name=file_path.name, enable_ocr=enable_ocr)

    encoding: str | None = None
    # A character-encoding guess only means something for payloads that are
    # decoded as text; for PDF/DOCX and similar containers it would be noise.
    if (
        not parsed.is_binary
        and parsed.text_content
        and _is_text_like_mime_type(parsed.mime_type)
    ):
        try:
            import chardet

            encoding = chardet.detect(file_bytes[:65536]).get("encoding")
        except Exception as exc:
            # Detection is optional; record why it was skipped instead of
            # discarding the failure silently.
            logger.debug("Encoding detection failed for %s: %s", file_path, exc)

    return ParsedFile(
        mime_type=parsed.mime_type,
        text_content=parsed.text_content,
        is_binary=parsed.is_binary,
        file_size_bytes=parsed.file_size_bytes,
        encoding=encoding,
        parse_error=parsed.parse_error,
    )