sirchmunk-0.0.0-py3-none-any.whl → sirchmunk-0.0.1-py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registries.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.dist-info/METADATA +416 -0
  41. sirchmunk-0.0.1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ sirchmunk/schema/metadata.py
@@ -0,0 +1,658 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ import enum
+ import json
+ import mimetypes
+ import re
+ import subprocess
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ from loguru import logger
+
+ from sirchmunk.llm.openai_chat import OpenAIChat
+ from sirchmunk.schema.snapshot import SnapshotInfo
+ from sirchmunk.utils.file_utils import get_fast_hash
+
+
+ class FileType(enum.Enum):
+     """
+     Enumeration of supported file types for specialized schema extraction.
+     """
+
+     PLAINTEXT = "plaintext"
+     CSV = "csv"
+     JSON = "json"
+     IMAGE = "image"
+     PDF = "pdf"
+     EXCEL = "excel"
+     VIDEO = "video"
+     OTHER = "other"
+     DIRECTORY = "directory"
+
+
+ @dataclass
+ class FileInfo:
+     """Base file metadata schema for any file or directory on disk.
+
+     Args:
+         file_or_url (Union[str, Path]): Absolute or relative path to the file/directory, or URL.
+         last_modified (datetime): Last modification time.
+         create_time (datetime): Creation time (or metadata change time on Unix).
+         type (FileType): Type of the file (e.g., text, image, pdf, directory).
+         do_snapshot (bool): Whether to capture a snapshot for this file.
+         snapshot (SnapshotInfo): Snapshot information (populated when do_snapshot is True).
+
+     Attributes (computed in __post_init__):
+         size_bytes (int): File size in bytes.
+         mime_type (str): MIME type inferred from the file extension (e.g., "text/plain").
+         extension (str): Lowercase file extension (e.g., ".txt").
+         md5 (str): MD5 hash of the file (empty string for directories); `md5_head` is used for quick fingerprinting.
+         cache_key (str): Unique cache key derived from the MD5 hash and size, used for change detection.
+     """
+
+     file_or_url: Union[str, Path]
+     last_modified: datetime
+     create_time: datetime
+     type: FileType = field(default=FileType.PLAINTEXT)
+     do_snapshot: bool = field(default=True)
+     snapshot: SnapshotInfo = field(default_factory=SnapshotInfo)
+
+     size_bytes: int = field(init=False)
+     mime_type: str = field(init=False)
+     extension: str = field(init=False)
+     md5: str = field(init=False)
+     cache_key: str = field(init=False)
+
+     def __post_init__(self) -> None:
+         # TODO: add URLs support
+         self.file_or_url = Path(self.file_or_url)
+         self.extension = self.file_or_url.suffix.lower()
+         self.size_bytes = self.file_or_url.stat().st_size
+         self.mime_type = (
+             mimetypes.guess_type(self.file_or_url)[0] or "application/octet-stream"
+         )
+         self.md5 = self.get_file_md5(file_path=self.file_or_url)
+         self.cache_key = self.get_cache_key(file_or_url=self.file_or_url)
+
+     def base_kwargs(self) -> Dict[str, Any]:
+         """Return a dict of fields that can be safely passed to a child dataclass __init__.
+
+         Excludes fields with `init=False` (e.g., `extension`, `mime_type`), which are
+         computed in __post_init__ and must not be passed during initialization.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing only the init-accepted fields:
+                 {"file_or_url", "last_modified", "create_time", "type", "do_snapshot", "snapshot"}.
+         """
+         # Get all fields declared in the dataclass (including inherited)
+         init_fields = {f.name for f in self.__dataclass_fields__.values() if f.init}
+         # Filter self.__dict__ to only include init-accepted fields
+         return {k: v for k, v in self.__dict__.items() if k in init_fields}
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert the FileInfo instance to a dictionary.
+
+         Returns:
+             Dict[str, Any]: A dictionary representation of the FileInfo instance.
+         """
+         return {
+             "path": str(self.file_or_url),
+             "type": self.type.value,
+             "size_bytes": self.size_bytes,
+             "last_modified": self.last_modified.isoformat(),
+             "create_time": self.create_time.isoformat(),
+             "do_snapshot": self.do_snapshot,
+             "snapshot": self.snapshot.to_dict(),
+             "mime_type": self.mime_type,
+             "extension": self.extension,
+             "md5": self.md5,
+             "cache_key": self.cache_key,
+         }
+
+     @staticmethod
+     def from_dict(info: Dict[str, Any]) -> "FileInfo":
+         """Create a FileInfo instance from a dictionary produced by `to_dict`.
+
+         Args:
+             info (Dict[str, Any]): A dictionary containing the fields of FileInfo.
+
+         Returns:
+             FileInfo: An instance of FileInfo populated with the provided data.
+         """
+         return FileInfo(
+             file_or_url=Path(info["path"]),
+             type=FileType(info["type"]),
+             last_modified=datetime.fromisoformat(info["last_modified"]),
+             create_time=datetime.fromisoformat(info["create_time"]),
+             do_snapshot=info.get("do_snapshot", True),
+         )
+
+     @staticmethod
+     def get_file_md5(file_path: Union[str, Path]) -> str:
+         """
+         Get the MD5 hash of a file if it exists; empty string otherwise.
+         """
+         file_path = Path(file_path)
+         return get_fast_hash(file_path=file_path) if file_path.is_file() else ""
+
+     @staticmethod
+     def get_cache_key(file_or_url: Union[str, Path]) -> str:
+         """Generate a unique cache key for the file based on its MD5 hash and size.
+
+         Returns:
+             str: A unique cache key string ("<md5>_<size_bytes>"), or "" if the hash is unavailable.
+         """
+         md5: str = FileInfo.get_file_md5(file_path=file_or_url)
+         size_bytes: int = Path(file_or_url).stat().st_size
+         cache_key: str = f"{md5}_{str(size_bytes)}" if md5 else ""
+
+         return cache_key
+
+     @staticmethod
+     def get_path_mtime(f_path: Union[str, Path], mtime: datetime) -> str:
+         """
+         Generate a unique identifier for a file based on its path and modification time, used for `unchanged` checks.
+
+         Args:
+             f_path (Union[str, Path]): The file path.
+             mtime (datetime): The last modification time of the file.
+
+         Returns:
+             str: A unique identifier string in the format "path@ISO8601_mtime".
+         """
+         resolved: str = str(Path(f_path).resolve())
+         return f"{resolved}@{mtime.isoformat()}"
+
+
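Editor's note: a minimal sketch of the FileInfo round trip, assuming this module lands at sirchmunk/schema/metadata.py (the +658-line file in the list above) and that "notes.txt" is a hypothetical existing file:

```python
from datetime import datetime
from pathlib import Path

from sirchmunk.schema.metadata import FileInfo

p = Path("notes.txt")  # hypothetical file on disk
stat = p.stat()
info = FileInfo(
    file_or_url=p,
    last_modified=datetime.fromtimestamp(stat.st_mtime),
    create_time=datetime.fromtimestamp(stat.st_ctime),
)
# Derived fields are computed in __post_init__
print(info.extension, info.mime_type, info.cache_key)  # ".txt", "text/plain", "<md5>_<size>"

# Serialize and restore; derived fields are recomputed from disk on load
restored = FileInfo.from_dict(info.to_dict())
assert restored.cache_key == info.cache_key  # holds while the file is unchanged
```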
+ @dataclass
+ class TextFileSchema(FileInfo):
+     """Schema for plain-text-like files (e.g., .txt, .md, .log, .py).
+
+     Args:
+         encoding (Optional[str], optional): Detected character encoding (e.g., "utf-8"). Defaults to None.
+         line_count (Optional[int], optional): Total number of lines (not computed by default). Defaults to None.
+         first_lines_preview (List[str], optional): First few lines (up to 5) for quick inspection. Defaults to empty list.
+     """
+
+     encoding: Optional[str] = None
+     line_count: Optional[int] = None
+     first_lines_preview: List[str] = field(default_factory=list)
+
+
+ @dataclass
+ class CSVFileSchema(FileInfo):
+     """Schema for CSV files.
+
+     Args:
+         delimiter (str, optional): Detected field delimiter (e.g., ",", ";"). Defaults to ",".
+         has_header (bool, optional): Whether the first row appears to be a header. Defaults to True.
+         columns (List[str], optional): List of column names. Empty if no header is detected. Defaults to empty list.
+         row_count (Optional[int], optional): Estimated or actual row count (not computed by default). Defaults to None.
+         sample_rows (List[Dict[str, Any]], optional): Sample of up to 3 parsed rows as dictionaries. Defaults to empty list.
+     """
+
+     delimiter: str = field(default=",")
+     has_header: bool = field(default=True)
+     columns: List[str] = field(default_factory=list)
+     row_count: Optional[int] = field(default=None)
+     sample_rows: List[Dict[str, Any]] = field(default_factory=list)
+
+
+ @dataclass
+ class JSONFileSchema(FileInfo):
+     """Schema for JSON files.
+
+     Args:
+         root_type (Literal["object", "array", "scalar"], optional): Type of the JSON root element. Defaults to "object".
+         inferred_schema (Dict[str, Any], optional): Inferred JSON Schema (structural summary). Defaults to empty dict.
+         is_valid_json (bool, optional): Whether the file is syntactically valid JSON. Defaults to True.
+     """
+
+     root_type: Literal["object", "array", "scalar"] = "object"
+     inferred_schema: Dict[str, Any] = field(default_factory=dict)
+     is_valid_json: bool = field(default=True)
+
+
+ @dataclass
+ class ImageFileSchema(FileInfo):
+     """Schema for image files (PNG, JPEG, etc.).
+
+     Args:
+         width (int): Image width in pixels.
+         height (int): Image height in pixels.
+         mode (Optional[str], optional): PIL color mode (e.g., "RGB", "RGBA", "L"). Defaults to None.
+         format (Optional[str], optional): Image format (e.g., "PNG", "JPEG"). Defaults to None.
+         color_profile (Optional[str], optional): Color profile type if embedded (e.g., "icc"). Defaults to None.
+     """
+
+     width: int = field(default=0)
+     height: int = field(default=0)
+     mode: Optional[str] = field(default=None)
+     format: Optional[str] = field(default=None)
+     color_profile: Optional[str] = field(default=None)
+
+
+ @dataclass
+ class PDFFileSchema(FileInfo):
+     """Schema for PDF documents.
+
+     Args:
+         page_count (int): Number of pages in the document.
+         author (Optional[str], optional): Document author from metadata. Defaults to None.
+         title (Optional[str], optional): Document title from metadata. Defaults to None.
+         keywords (List[str], optional): Keywords from metadata. Defaults to empty list.
+         is_encrypted (bool, optional): Whether the PDF is encrypted. Defaults to False.
+     """
+
+     page_count: int = field(default=0)
+     author: Optional[str] = field(default=None)
+     title: Optional[str] = field(default=None)
+     keywords: List[str] = field(default_factory=list)
+     is_encrypted: bool = field(default=False)
+
+
+ @dataclass
+ class ExcelFileSchema(FileInfo):
+     """Schema for Excel workbooks (.xlsx, .xls).
+
+     Args:
+         sheet_names (List[str], optional): Names of all sheets in the workbook. Defaults to empty list.
+         sheet_schemas (Dict[str, CSVFileSchema], optional): Mapping from sheet name to its CSV-like schema.
+             Each sheet is treated as a tabular dataset. Defaults to empty dict.
+     """
+
+     sheet_names: List[str] = field(default_factory=list)
+     sheet_schemas: Dict[str, CSVFileSchema] = field(default_factory=dict)
+
+
+ @dataclass
+ class VideoFileSchema(FileInfo):
+     """Schema for video files (MP4, AVI, MOV, etc.).
+
+     Extracts metadata via `ffprobe` (part of FFmpeg). Falls back to basic FileInfo if `ffprobe` is unavailable.
+
+     Args:
+         duration_sec (float): Duration of the video in seconds.
+         width (int): Frame width in pixels.
+         height (int): Frame height in pixels.
+         codec (str): Video codec name (e.g., "h264", "hevc", "vp9").
+         framerate (float): Frames per second (e.g., 29.97, 60.0).
+         bitrate_kbps (int): Average video bitrate in kbps.
+         has_audio (bool): Whether the file contains an audio stream.
+         audio_codec (Optional[str], optional): Audio codec name if present (e.g., "aac", "mp3"). Defaults to None.
+         rotation (int, optional): Display rotation (0, 90, 180, 270) from metadata tags. Defaults to 0.
+     """
+
+     duration_sec: float = field(default=0.0)
+     width: int = field(default=0)
+     height: int = field(default=0)
+     codec: str = field(default="")
+     framerate: float = field(default=0.0)
+     bitrate_kbps: int = field(default=0)
+     has_audio: bool = field(default=False)
+     audio_codec: Optional[str] = field(default=None)
+     rotation: int = field(default=0)
+
+
+ def build_file_schema(
+     path: Union[str, Path],
+     llm: Optional[OpenAIChat] = None,
+ ) -> FileInfo:
+     """
+     Build a typed schema object for a given file path, based on its type.
+
+     Supports: text, CSV, JSON, images, PDF, Excel, and video files.
+     Falls back to basic FileInfo if parsing fails or the type is unknown.
+
+     Args:
+         path (Union[str, Path]): Path to the file (must exist).
+         llm (Optional[OpenAIChat], optional): OpenAI-compatible chat client used for
+             snapshot generation. Defaults to None.
+
+     Returns:
+         FileInfo: An instance of FileInfo or one of its subclasses with type-specific metadata.
+
+     Raises:
+         FileNotFoundError: If the path does not exist.
+     """
+     path = Path(path)
+     if not path.exists():
+         raise FileNotFoundError(f"Path not found: {path}")
+
+     stat = path.stat()
+     base_info = FileInfo(
+         file_or_url=path,
+         last_modified=datetime.fromtimestamp(stat.st_mtime),
+         create_time=datetime.fromtimestamp(stat.st_ctime),
+         do_snapshot=True,
+     )
+
+     if path.is_dir():
+         base_info.type = FileType.DIRECTORY
+         return base_info
+
+     try:
+         # --- Text-like files ---
+         if base_info.extension in {
+             ".txt",
+             ".md",
+             ".log",
+             ".py",
+             ".json",
+             ".yml",
+             ".yaml",
+             ".xml",
+         }:
+             base_info.type = FileType.PLAINTEXT
+             schema = _build_text_schema(
+                 file_info=base_info,
+                 llm=llm,
+             )
+             if base_info.extension == ".json":
+                 base_info.type = FileType.JSON
+                 schema = _build_json_schema(base_info, schema)
+             return schema
+
+         # --- CSV ---
+         if base_info.extension == ".csv":
+             base_info.type = FileType.CSV
+             return _build_csv_schema(base_info)
+
+         # --- Images ---
+         if base_info.mime_type.startswith("image/"):
+             base_info.type = FileType.IMAGE
+             return _build_image_schema(base_info)
+
+         # --- PDF ---
+         if base_info.extension == ".pdf":
+             base_info.type = FileType.PDF
+             return _build_pdf_schema(base_info)
+
+         # --- Excel ---
+         if base_info.extension in {".xlsx", ".xls"}:
+             base_info.type = FileType.EXCEL
+             return _build_excel_schema(base_info)
+
+         # --- Video ---
+         if base_info.mime_type.startswith("video/") or base_info.extension in {
+             ".mp4",
+             ".avi",
+             ".mov",
+             ".mkv",
+             ".webm",
+             ".flv",
+             ".wmv",
+         }:
+             base_info.type = FileType.VIDEO
+             video_schema = _build_video_schema(base_info)
+             if video_schema:
+                 return video_schema
+
+     except Exception as e:
+         logger.warning(f"Error building schema for {path}: {e}")
+
+     # Fallback
+     return base_info
+
+
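Editor's note: a sketch of how the dispatcher is meant to be called. The path is hypothetical, and no LLM client is passed, so snapshot generation runs without one:

```python
from sirchmunk.schema.metadata import FileType, build_file_schema

schema = build_file_schema("data/report.csv")  # hypothetical path
if schema.type == FileType.CSV:
    print(schema.delimiter, schema.columns, schema.sample_rows[:1])
else:
    # Unknown types and failed parses fall back to plain FileInfo
    print(schema.type, schema.size_bytes, schema.mime_type)
```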
+ def _build_text_schema(
+     file_info: FileInfo,
+     llm: Optional[OpenAIChat] = None,
+ ) -> TextFileSchema:
+     """Build TextFileSchema by sampling the beginning of the file."""
+     try:
+         with open(file_info.file_or_url, "rb") as f:
+             raw = f.read(2048)
+         encoding = _detect_encoding(raw)
+         text = raw.decode(encoding, errors="replace")
+         lines = text.splitlines()
+
+         if file_info.do_snapshot:
+             from sirchmunk.schema.snapshot import TextSnapshot
+
+             snapshot_info: SnapshotInfo = TextSnapshot(
+                 llm=llm,
+             ).sampling(
+                 file_path=file_info.file_or_url,
+             )
+             file_info.snapshot = snapshot_info
+
+         return TextFileSchema(
+             **file_info.base_kwargs(),
+             encoding=encoding,
+             first_lines_preview=lines[:5],
+         )
+     except Exception as e:
+         logger.warning(f"Error building text schema for {file_info.file_or_url}: {e}")
+         return TextFileSchema(**file_info.base_kwargs())
+
+
+ def _build_json_schema(file_info: FileInfo, fallback: TextFileSchema) -> JSONFileSchema:
+     """Attempt to parse and infer a schema for JSON files using Genson."""
+     try:
+         from genson import SchemaBuilder
+
+         data = json.loads(
+             file_info.file_or_url.read_text(encoding=fallback.encoding or "utf-8")
+         )
+         builder = SchemaBuilder()
+         builder.add_object(data)
+         # Returns a dict, e.g., {"type": "object", "properties": {...}}
+         inferred = builder.to_schema()
+
+         return JSONFileSchema(
+             **file_info.base_kwargs(),
+             root_type=_json_root_type(data),
+             inferred_schema=inferred,
+             is_valid_json=True,
+         )
+     except Exception as e:
+         logger.warning(f"Error building JSON schema for {file_info.file_or_url}: {e}")
+         return JSONFileSchema(**file_info.base_kwargs(), is_valid_json=False)
+
+
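Editor's note: for reference, a minimal standalone sketch of the Genson inference used above, assuming the `genson` package is installed:

```python
from genson import SchemaBuilder

builder = SchemaBuilder()
builder.add_object({"name": "a", "tags": ["x", "y"]})
print(builder.to_schema())
# e.g. {"$schema": ..., "type": "object",
#       "properties": {"name": {"type": "string"}, "tags": {...}}, ...}
```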
+ def _build_csv_schema(file_info: FileInfo) -> CSVFileSchema:
+     """Build CSV schema using csv.Sniffer and sample parsing."""
+     import csv
+
+     with open(
+         file_info.file_or_url, newline="", encoding="utf-8", errors="ignore"
+     ) as f:
+         sample = f.read(4096)
+         sniffer = csv.Sniffer()
+         dialect = sniffer.sniff(sample)
+         has_header = sniffer.has_header(sample)
+         f.seek(0)
+         reader = csv.DictReader(f, dialect=dialect)
+         columns = list(reader.fieldnames) if reader.fieldnames else []
+         sample_rows = [row for _, row in zip(range(3), reader)]
+     return CSVFileSchema(
+         **file_info.base_kwargs(),
+         delimiter=dialect.delimiter,
+         has_header=has_header,
+         columns=columns,
+         sample_rows=sample_rows,
+     )
+
+
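Editor's note: a minimal sketch of the stdlib sniffing used above, run on an in-memory sample:

```python
import csv

sample = "id;name\n1;alice\n2;bob\n"
sniffer = csv.Sniffer()
dialect = sniffer.sniff(sample)
print(dialect.delimiter)           # ";"
print(sniffer.has_header(sample))  # True (heuristic; can misfire on small samples)
```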
+ def _build_image_schema(file_info: FileInfo) -> ImageFileSchema:
+     """Build image schema using PIL (lightweight metadata read)."""
+     from PIL import Image
+
+     with Image.open(file_info.file_or_url) as img:
+         return ImageFileSchema(
+             **file_info.base_kwargs(),
+             width=img.width,
+             height=img.height,
+             mode=img.mode,
+             format=img.format or "",
+             color_profile="icc" if "icc_profile" in img.info else None,
+         )
+
+
+ def _build_pdf_schema(file_info: FileInfo) -> PDFFileSchema:
+     """Build PDF schema using pypdf (streaming, no full load)."""
+     import pypdf
+
+     with open(file_info.file_or_url, "rb") as f:
+         reader = pypdf.PdfReader(f)
+         meta = reader.metadata or {}
+         return PDFFileSchema(
+             **file_info.base_kwargs(),
+             page_count=len(reader.pages),
+             author=meta.get("/Author"),
+             title=meta.get("/Title"),
+             keywords=_parse_pdf_keywords(meta.get("/Keywords")),
+             is_encrypted=reader.is_encrypted,
+         )
+
+
+ def _build_excel_schema(file_info: FileInfo) -> ExcelFileSchema:
+     """Build Excel schema using pandas (reads only headers + a few rows)."""
+     import pandas as pd
+
+     sheets = pd.read_excel(file_info.file_or_url, sheet_name=None, nrows=5)
+     sheet_schemas = {}
+
+     # Only init-accepted FileInfo fields may be passed to CSVFileSchema
+     base_kwargs = {
+         "file_or_url": file_info.file_or_url,
+         "last_modified": file_info.last_modified,
+         "create_time": file_info.create_time,
+         "type": FileType.CSV,  # each sheet is treated as a tabular dataset
+         "do_snapshot": False,  # sheet-level schemas carry no snapshot of their own
+     }
+
+     for name, df in sheets.items():
+         cols = df.columns.tolist()
+         sample = df.head(3).to_dict(orient="records")
+         sheet_schemas[name] = CSVFileSchema(
+             **base_kwargs,
+             delimiter=",",
+             has_header=True,
+             columns=cols,
+             sample_rows=sample,
+         )
+
+     return ExcelFileSchema(
+         **file_info.base_kwargs(),
+         sheet_names=list(sheets.keys()),
+         sheet_schemas=sheet_schemas,
+     )
+
+
+ def _build_video_schema(file_info: FileInfo) -> Optional[VideoFileSchema]:
+     """
+     Build video schema by calling `ffprobe` to extract metadata.
+
+     Args:
+         file_info (FileInfo): Base file info for the video file.
+
+     Returns:
+         VideoFileSchema if ffprobe succeeds; None otherwise (the caller should fall back).
+     """
+     try:
+         cmd = [
+             "ffprobe",
+             "-v",
+             "quiet",
+             "-print_format",
+             "json",
+             "-show_format",
+             "-show_streams",
+             str(file_info.file_or_url),
+         ]
+         result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+         if result.returncode != 0:
+             return None
+
+         data = json.loads(result.stdout)
+         format_info = data.get("format", {})
+         streams = data.get("streams", [])
+
+         # Take the first video stream and the first audio stream
+         video_stream = next(
+             (s for s in streams if s.get("codec_type") == "video"), None
+         )
+         audio_stream = next(
+             (s for s in streams if s.get("codec_type") == "audio"), None
+         )
+
+         if not video_stream:
+             return None
+
+         # Extract key fields with safe defaults
+         width = int(video_stream.get("width", 0))
+         height = int(video_stream.get("height", 0))
+         codec = video_stream.get("codec_name", "unknown")
+         duration = float(format_info.get("duration", 0.0))
+         bitrate = int(format_info.get("bit_rate", 0)) // 1000  # bps → kbps
+
+         # Framerate: may be "30/1" or "2997/100"
+         fps_str = video_stream.get("avg_frame_rate", "0/1")
+         if "/" in fps_str:
+             num, den = map(int, fps_str.split("/"))
+             framerate = num / den if den != 0 else 0.0
+         else:
+             framerate = float(fps_str or 0.0)
+
+         # Rotation from metadata tags
+         rotation = 0
+         tags = video_stream.get("tags", {})
+         rotate_tag = tags.get("rotate") or tags.get("ROTATE")
+         if rotate_tag and re.match(r"^-?\d+$", rotate_tag):
+             rotation = int(rotate_tag) % 360
+
+         return VideoFileSchema(
+             **file_info.base_kwargs(),
+             duration_sec=duration,
+             width=width,
+             height=height,
+             codec=codec,
+             framerate=framerate,
+             bitrate_kbps=bitrate,
+             has_audio=audio_stream is not None,
+             audio_codec=audio_stream.get("codec_name") if audio_stream else None,
+             rotation=rotation,
+         )
+
+     except (
+         subprocess.SubprocessError,
+         FileNotFoundError,
+         json.JSONDecodeError,
+         ValueError,
+         OSError,
+     ) as e:
+         logger.warning(f"Error building video schema for {file_info.file_or_url}: {e}")
+         return None
+
+
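Editor's note: ffprobe reports `avg_frame_rate` as a rational string like "30000/1001". The manual split above handles it; the stdlib `fractions.Fraction` is an equivalent alternative:

```python
from fractions import Fraction

fps = float(Fraction("30000/1001"))  # about 29.97
```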
+ def _detect_encoding(raw: bytes) -> str:
+     """Detect text encoding from raw bytes using charset_normalizer (falls back to utf-8)."""
+     try:
+         import charset_normalizer
+
+         result = charset_normalizer.from_bytes(raw).best()
+         return result.encoding if result else "utf-8"
+     except ImportError:
+         return "utf-8"
+
+
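Editor's note: a quick standalone check of the detection call, assuming `charset_normalizer` is installed:

```python
import charset_normalizer

best = charset_normalizer.from_bytes("héllo".encode("utf-8")).best()
print(best.encoding if best else "utf-8")  # typically "utf-8"
```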
+ def _json_root_type(obj) -> Literal["object", "array", "scalar"]:
+     """Determine the JSON root type for schema labeling."""
+     if isinstance(obj, dict):
+         return "object"
+     elif isinstance(obj, list):
+         return "array"
+     else:
+         return "scalar"
+
+
+ def _parse_pdf_keywords(raw: Optional[str]) -> List[str]:
+     """Parse a PDF keywords string into a clean list."""
+     if not raw:
+         return []
+     # Normalize: split by common delimiters and strip
+     return [k.strip() for k in re.split(r"[,;|]", raw) if k.strip()]
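Editor's note: for instance, a metadata string like "python, pdf; metadata" splits on the `[,;|]` delimiters and the fragments are stripped:

```python
assert _parse_pdf_keywords("python, pdf; metadata") == ["python", "pdf", "metadata"]
assert _parse_pdf_keywords(None) == []
```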