sirchmunk 0.0.0__py3-none-any.whl → 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/__init__.py +8 -0
- sirchmunk/base.py +17 -0
- sirchmunk/insight/__init__.py +4 -0
- sirchmunk/insight/text_insights.py +292 -0
- sirchmunk/learnings/__init__.py +1 -0
- sirchmunk/learnings/evidence_processor.py +525 -0
- sirchmunk/learnings/knowledge_base.py +232 -0
- sirchmunk/llm/__init__.py +2 -0
- sirchmunk/llm/openai_chat.py +247 -0
- sirchmunk/llm/prompts.py +216 -0
- sirchmunk/retrieve/__init__.py +1 -0
- sirchmunk/retrieve/base.py +25 -0
- sirchmunk/retrieve/text_retriever.py +1026 -0
- sirchmunk/scan/__init__.py +1 -0
- sirchmunk/scan/base.py +18 -0
- sirchmunk/scan/file_scanner.py +373 -0
- sirchmunk/scan/web_scanner.py +18 -0
- sirchmunk/scheduler/__init__.py +0 -0
- sirchmunk/schema/__init__.py +2 -0
- sirchmunk/schema/cognition.py +106 -0
- sirchmunk/schema/context.py +25 -0
- sirchmunk/schema/knowledge.py +318 -0
- sirchmunk/schema/metadata.py +658 -0
- sirchmunk/schema/request.py +221 -0
- sirchmunk/schema/response.py +20 -0
- sirchmunk/schema/snapshot.py +346 -0
- sirchmunk/search.py +475 -0
- sirchmunk/storage/__init__.py +7 -0
- sirchmunk/storage/duckdb.py +676 -0
- sirchmunk/storage/knowledge_manager.py +720 -0
- sirchmunk/utils/__init__.py +15 -0
- sirchmunk/utils/constants.py +15 -0
- sirchmunk/utils/deps.py +23 -0
- sirchmunk/utils/file_utils.py +70 -0
- sirchmunk/utils/install_rga.py +124 -0
- sirchmunk/utils/log_utils.py +360 -0
- sirchmunk/utils/tokenizer_util.py +55 -0
- sirchmunk/utils/utils.py +108 -0
- sirchmunk/version.py +1 -1
- sirchmunk-0.0.1.dist-info/METADATA +416 -0
- sirchmunk-0.0.1.dist-info/RECORD +45 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.0.dist-info/METADATA +0 -26
- sirchmunk-0.0.0.dist-info/RECORD +0 -8
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,658 @@
|
|
|
1
|
+
# Copyright (c) ModelScope Contributors. All rights reserved.
|
|
2
|
+
import enum
|
|
3
|
+
import json
|
|
4
|
+
import mimetypes
|
|
5
|
+
import re
|
|
6
|
+
import subprocess
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
11
|
+
|
|
12
|
+
from loguru import logger
|
|
13
|
+
|
|
14
|
+
from sirchmunk.llm.openai_chat import OpenAIChat
|
|
15
|
+
from sirchmunk.schema.snapshot import SnapshotInfo
|
|
16
|
+
from sirchmunk.utils.file_utils import get_fast_hash
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class FileType(enum.Enum):
    """
    Enumeration of supported file types for specialized schema extraction.

    Each member's value is the lowercase string form that is written to and
    read from serialized metadata (see ``FileInfo.to_dict`` / ``from_dict``).
    """

    PLAINTEXT = "plaintext"  # .txt, .md, .log, source code, YAML/XML, ...
    CSV = "csv"
    JSON = "json"
    IMAGE = "image"
    PDF = "pdf"
    EXCEL = "excel"
    VIDEO = "video"
    OTHER = "other"  # fallback for unrecognized file types
    DIRECTORY = "directory"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class FileInfo:
    """Base file metadata schema for any file or directory on disk.

    Args:
        file_or_url (Path): Absolute or relative path to the file/directory, or URL.
        type (FileType): Type of the file (e.g., text, image, pdf, directory).
        last_modified (datetime): Last modification time.
        create_time (datetime): Creation time (or metadata change time on Unix).
        do_snapshot (bool): Whether to capture a snapshot for this file.
        snapshot (SnapshotInfo): Snapshot information (if do_snapshot is True).

    Attributes (computed in __post_init__):
        size_bytes (int): File size in bytes.
        mime_type (str): MIME type inferred from file extension (e.g., "text/plain").
        extension (str): Lowercase file extension (e.g., ".txt").
        md5 (str): MD5 hash of the file. Empty string for directories. We use `md5_head` for quick fingerprinting.
        cache_key (str): Unique cache key based on MD5 and size for change detection.
    """

    file_or_url: Union[str, Path]
    last_modified: datetime
    create_time: datetime
    type: FileType = field(default=FileType.PLAINTEXT)
    do_snapshot: bool = field(default=True)
    snapshot: SnapshotInfo = field(default_factory=SnapshotInfo)

    size_bytes: int = field(init=False)
    mime_type: str = field(init=False)
    extension: str = field(init=False)
    md5: str = field(init=False)
    cache_key: str = field(init=False)

    def __post_init__(self) -> None:
        # TODO: add URLs support
        self.file_or_url = Path(self.file_or_url)
        self.extension = self.file_or_url.suffix.lower()
        self.size_bytes = self.file_or_url.stat().st_size
        self.mime_type = (
            mimetypes.guess_type(self.file_or_url)[0] or "application/octet-stream"
        )
        self.md5 = self.get_file_md5(file_path=self.file_or_url)
        # Perf fix: reuse the md5/size computed just above instead of calling
        # get_cache_key(), which would hash the file a second time. The
        # resulting key is identical: "{md5}_{size_bytes}", or "" when md5 is
        # empty (directories).
        self.cache_key = f"{self.md5}_{self.size_bytes}" if self.md5 else ""

    def base_kwargs(self) -> Dict[str, Any]:
        """Return a dict of fields that can be safely passed to child dataclass __init__.

        Excludes fields with `init=False` (e.g., `extension`, `mime_type`), which are
        computed in __post_init__ and must not be passed during initialization.

        Returns:
            Dict[str, Any]: A dictionary containing only the init-accepted fields:
                {"file_or_url", "last_modified", "create_time", "type",
                "do_snapshot", "snapshot"}.
        """
        # Get all fields declared in the dataclass (including inherited)
        init_fields = {f.name for f in self.__dataclass_fields__.values() if f.init}
        # Filter self.__dict__ to only include init-accepted fields
        return {k: v for k, v in self.__dict__.items() if k in init_fields}

    def to_dict(self) -> Dict[str, Any]:
        """Convert the FileInfo instance to a dictionary.

        Returns:
            Dict[str, Any]: A dictionary representation of the FileInfo instance.
        """
        return {
            "path": str(self.file_or_url),
            "type": self.type.value,
            "size_bytes": self.size_bytes,
            "last_modified": self.last_modified.isoformat(),
            "create_time": self.create_time.isoformat(),
            "do_snapshot": self.do_snapshot,
            "snapshot": self.snapshot.to_dict(),
            "mime_type": self.mime_type,
            "extension": self.extension,
            "md5": self.md5,
            "cache_key": self.cache_key,
        }

    @staticmethod
    def from_dict(info: Dict[str, Any]) -> "FileInfo":
        """Create a FileInfo instance from a dictionary.

        Note: derived fields (size, md5, cache_key, ...) are recomputed from
        disk by __post_init__, and the serialized "snapshot" entry is not
        restored here.

        Args:
            info (Dict[str, Any]): A dictionary containing the fields of FileInfo.

        Returns:
            FileInfo: An instance of FileInfo populated with the provided data.
        """
        return FileInfo(
            file_or_url=Path(info["path"]),
            type=FileType(info["type"]),
            last_modified=datetime.fromisoformat(info["last_modified"]),
            create_time=datetime.fromisoformat(info["create_time"]),
            do_snapshot=info.get("do_snapshot", True),
        )

    @staticmethod
    def get_file_md5(file_path: Union[str, Path]) -> str:
        """
        Get the MD5 hash of a file if it exists.

        Returns an empty string for directories and non-existent paths.
        """
        file_path = Path(file_path)
        return get_fast_hash(file_path=file_path) if file_path.is_file() else ""

    @staticmethod
    def get_cache_key(file_or_url: Union[str, Path]) -> str:
        """Generate a unique cache key for the file based on its path and MD5 hash.

        Note: this hashes the file on every call; prefer the precomputed
        `cache_key` attribute when a FileInfo instance is already available.

        Returns:
            str: A unique cache key string ("" when the md5 is empty).
        """
        md5: str = FileInfo.get_file_md5(file_path=file_or_url)
        size_bytes: int = Path(file_or_url).stat().st_size
        cache_key: str = f"{md5}_{str(size_bytes)}" if md5 else ""

        return cache_key

    @staticmethod
    def get_path_mtime(f_path: Union[str, Path], mtime: datetime) -> str:
        """
        Generate a unique identifier for a file based on its path and modification time for `unchanged` checking.

        Args:
            f_path (Union[str, Path]): The file path.
            mtime (datetime): The last modification time of the file.

        Returns:
            str: A unique identifier string in the format "path@ISO8601_mtime".
        """
        f_path: str = str(Path(f_path).resolve())
        return f"{f_path}@{mtime.isoformat()}"
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@dataclass
class TextFileSchema(FileInfo):
    """Schema for plain-text-like files (e.g., .txt, .md, .log, .py).

    Attributes beyond FileInfo:
        encoding: Detected character encoding (e.g., "utf-8"); None if unknown.
        line_count: Total number of lines -- not computed by default.
        first_lines_preview: Up to the first 5 lines for quick inspection.
    """

    encoding: Optional[str] = field(default=None)
    line_count: Optional[int] = field(default=None)
    first_lines_preview: List[str] = field(default_factory=list)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@dataclass
class CSVFileSchema(FileInfo):
    """Schema for CSV files.

    Attributes beyond FileInfo:
        delimiter: Detected field delimiter (e.g., ",", ";"). Defaults to ",".
        has_header: Whether the first row appears to be a header. Defaults to True.
        columns: Column names; empty when no header was detected.
        row_count: Estimated or actual row count -- not computed by default.
        sample_rows: Up to 3 parsed rows as dictionaries.
    """

    delimiter: str = ","
    has_header: bool = True
    columns: List[str] = field(default_factory=list)
    row_count: Optional[int] = None
    sample_rows: List[Dict[str, Any]] = field(default_factory=list)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@dataclass
class JSONFileSchema(FileInfo):
    """Schema for JSON files.

    Attributes beyond FileInfo:
        root_type: Kind of the JSON root element ("object", "array" or "scalar").
        inferred_schema: Inferred JSON Schema (structural summary).
        is_valid_json: Whether the file parsed as syntactically valid JSON.
    """

    root_type: Literal["object", "array", "scalar"] = "object"
    inferred_schema: Dict[str, Any] = field(default_factory=dict)
    is_valid_json: bool = True
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
@dataclass
class ImageFileSchema(FileInfo):
    """Schema for image files (PNG, JPEG, etc.).

    Args:
        width (int): Image width in pixels. Defaults to 0.
        height (int): Image height in pixels. Defaults to 0.
        mode (Optional[str]): PIL color mode (e.g., "RGB", "RGBA", "L"). Defaults to None.
        format (Optional[str]): Image format (e.g., "PNG", "JPEG"). Defaults to None.
        color_profile (Optional[str], optional): Color profile type if embedded (e.g., "icc"). Defaults to None.
    """

    width: int = field(default=0)
    height: int = field(default=0)
    # mode/format default to None, so they are annotated Optional[str]
    # (the previous `str` annotation contradicted the None default).
    mode: Optional[str] = field(default=None)
    format: Optional[str] = field(default=None)
    color_profile: Optional[str] = field(default=None)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
@dataclass
class PDFFileSchema(FileInfo):
    """Schema for PDF documents.

    Attributes beyond FileInfo:
        page_count: Number of pages in the document.
        author: Document author from metadata, if present.
        title: Document title from metadata, if present.
        keywords: Keywords parsed from metadata.
        is_encrypted: Whether the PDF is encrypted.
    """

    page_count: int = 0
    author: Optional[str] = None
    title: Optional[str] = None
    keywords: List[str] = field(default_factory=list)
    is_encrypted: bool = False
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
@dataclass
class ExcelFileSchema(FileInfo):
    """Schema for Excel workbooks (.xlsx, .xls).

    Attributes beyond FileInfo:
        sheet_names: Names of all sheets in the workbook.
        sheet_schemas: Per-sheet CSV-like schema, keyed by sheet name;
            each sheet is treated as a tabular dataset.
    """

    sheet_names: List[str] = field(default_factory=list)
    sheet_schemas: Dict[str, CSVFileSchema] = field(default_factory=dict)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
@dataclass
class VideoFileSchema(FileInfo):
    """Schema for video files (MP4, AVI, MOV, etc.).

    Metadata is extracted via `ffprobe` (part of FFmpeg); callers fall back
    to basic FileInfo when `ffprobe` is unavailable.

    Attributes beyond FileInfo:
        duration_sec: Duration of the video in seconds.
        width: Frame width in pixels.
        height: Frame height in pixels.
        codec: Video codec name (e.g., "h264", "hevc", "vp9").
        framerate: Frames per second (e.g., 29.97, 60.0).
        bitrate_kbps: Average video bitrate in kbps.
        has_audio: Whether the file contains an audio stream.
        audio_codec: Audio codec name if present (e.g., "aac", "mp3").
        rotation: Display rotation (0, 90, 180, 270) from metadata tags.
    """

    duration_sec: float = 0.0
    width: int = 0
    height: int = 0
    codec: str = ""
    framerate: float = 0.0
    bitrate_kbps: int = 0
    has_audio: bool = False
    audio_codec: Optional[str] = None
    rotation: int = 0
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def build_file_schema(
    path: Union[str, Path],
    llm: Optional[OpenAIChat] = None,
) -> FileInfo:
    """
    Build a typed schema object for a given file path, based on its type.

    Supports: text, CSV, JSON, images, PDF, Excel, and video files.
    Falls back to basic FileInfo if parsing fails or type is unknown.

    Args:
        path (Path): Path to the file (must exist).
        llm (OpenAIChat): The llm client of OpenAI api.

    Returns:
        FileInfo: An instance of FileInfo or one of its subclasses with type-specific metadata.

    Raises:
        FileNotFoundError: If the path does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Path not found: {path}")

    st = path.stat()
    base_info = FileInfo(
        file_or_url=path,
        last_modified=datetime.fromtimestamp(st.st_mtime),
        create_time=datetime.fromtimestamp(st.st_ctime),
        do_snapshot=True,
    )

    # Directories get no type-specific schema.
    if path.is_dir():
        base_info.type = FileType.DIRECTORY
        return base_info

    text_like = {".txt", ".md", ".log", ".py", ".json", ".yml", ".yaml", ".xml"}
    video_exts = {".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"}
    ext = base_info.extension

    try:
        # Text-like files (JSON is first sampled as text, then upgraded).
        if ext in text_like:
            base_info.type = FileType.PLAINTEXT
            schema = _build_text_schema(file_info=base_info, llm=llm)
            if ext == ".json":
                base_info.type = FileType.JSON
                schema = _build_json_schema(base_info, schema)
            return schema

        if ext == ".csv":
            base_info.type = FileType.CSV
            return _build_csv_schema(base_info)

        if base_info.mime_type.startswith("image/"):
            base_info.type = FileType.IMAGE
            return _build_image_schema(base_info)

        if ext == ".pdf":
            base_info.type = FileType.PDF
            return _build_pdf_schema(base_info)

        if ext in {".xlsx", ".xls"}:
            base_info.type = FileType.EXCEL
            return _build_excel_schema(base_info)

        if base_info.mime_type.startswith("video/") or ext in video_exts:
            base_info.type = FileType.VIDEO
            video_schema = _build_video_schema(base_info)
            # _build_video_schema returns None when ffprobe is unavailable
            # or fails; fall through to the plain FileInfo below.
            if video_schema:
                return video_schema

    except Exception as e:
        logger.warning(f"Error building schema for {path}: {e}")

    # Fallback: plain FileInfo (base_info.type keeps whatever was detected).
    return base_info
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def _build_text_schema(
    file_info: FileInfo,
    llm: Optional[OpenAIChat] = None,
) -> TextFileSchema:
    """Build TextFileSchema by sampling the first 2 KiB of the file."""
    try:
        with open(file_info.file_or_url, "rb") as fh:
            head = fh.read(2048)

        detected = _detect_encoding(head)
        preview_lines = head.decode(detected, errors="replace").splitlines()

        if file_info.do_snapshot:
            from sirchmunk.schema.snapshot import TextSnapshot

            # Store the snapshot on file_info so base_kwargs() carries it
            # into the TextFileSchema below.
            file_info.snapshot = TextSnapshot(llm=llm).sampling(
                file_path=file_info.file_or_url,
            )

        return TextFileSchema(
            **file_info.base_kwargs(),
            encoding=detected,
            first_lines_preview=preview_lines[:5],
        )
    except Exception as e:
        logger.warning(f"Error building text schema for {file_info.file_or_url}: {e}")
        return TextFileSchema(**file_info.base_kwargs())
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def _build_json_schema(file_info: FileInfo, fallback: TextFileSchema) -> JSONFileSchema:
    """Parse the file as JSON and infer a structural schema via Genson."""
    try:
        from genson import SchemaBuilder

        text = file_info.file_or_url.read_text(encoding=fallback.encoding or "utf-8")
        data = json.loads(text)

        builder = SchemaBuilder()
        builder.add_object(data)
        # Yields a dict, e.g., {"type": "object", "properties": {...}}
        inferred = builder.to_schema()

        return JSONFileSchema(
            **file_info.base_kwargs(),
            root_type=_json_root_type(data),
            inferred_schema=inferred,
            is_valid_json=True,
        )
    except Exception as e:
        logger.warning(f"Error building JSON schema for {file_info.file_or_url}: {e}")
        return JSONFileSchema(**file_info.base_kwargs(), is_valid_json=False)
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _build_csv_schema(file_info: FileInfo) -> CSVFileSchema:
    """Build CSV schema using csv.Sniffer and a small sample of rows."""
    import csv

    with open(
        file_info.file_or_url, newline="", encoding="utf-8", errors="ignore"
    ) as fh:
        head = fh.read(4096)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(head)
        header_detected = sniffer.has_header(head)
        fh.seek(0)

        # NOTE(review): DictReader always consumes the first row as field
        # names, even when header_detected is False -- confirm intended.
        reader = csv.DictReader(fh, dialect=dialect)
        column_names = list(reader.fieldnames) if reader.fieldnames else []

        preview = []
        for record in reader:
            preview.append(record)
            if len(preview) == 3:
                break

        return CSVFileSchema(
            **file_info.base_kwargs(),
            delimiter=dialect.delimiter,
            has_header=header_detected,
            columns=column_names,
            sample_rows=preview,
        )
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def _build_image_schema(file_info: FileInfo) -> ImageFileSchema:
    """Build image schema via PIL without decoding pixel data."""
    from PIL import Image

    with Image.open(file_info.file_or_url) as image:
        has_icc = "icc_profile" in image.info
        return ImageFileSchema(
            **file_info.base_kwargs(),
            width=image.width,
            height=image.height,
            mode=image.mode,
            format=image.format or "",
            color_profile="icc" if has_icc else None,
        )
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def _build_pdf_schema(file_info: FileInfo) -> PDFFileSchema:
    """Build PDF schema using pypdf (metadata only; pages are not rendered)."""
    import pypdf

    with open(file_info.file_or_url, "rb") as fh:
        reader = pypdf.PdfReader(fh)
        metadata = reader.metadata or {}
        return PDFFileSchema(
            **file_info.base_kwargs(),
            page_count=len(reader.pages),
            author=metadata.get("/Author"),
            title=metadata.get("/Title"),
            keywords=_parse_pdf_keywords(metadata.get("/Keywords")),
            is_encrypted=reader.is_encrypted,
        )
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def _build_excel_schema(file_info: FileInfo) -> ExcelFileSchema:
    """Build Excel schema using pandas (reads only headers + few rows).

    Each sheet is summarized as a CSV-like schema (column names plus up to
    3 sample rows).

    Args:
        file_info (FileInfo): Base file info for the workbook.

    Returns:
        ExcelFileSchema: Workbook schema with per-sheet CSV-like schemas.
    """
    import pandas as pd

    # sheet_name=None reads all sheets; nrows=5 keeps the read cheap.
    sheets = pd.read_excel(file_info.file_or_url, sheet_name=None, nrows=5)
    sheet_schemas = {}

    # Bug fix: the per-sheet schemas previously received hand-built kwargs
    # ("path", "is_dir") that are not FileInfo init fields, so constructing
    # CSVFileSchema raised TypeError on every call. Reuse the workbook's own
    # init-accepted fields instead.
    sheet_base_kwargs = file_info.base_kwargs()

    for name, df in sheets.items():
        cols = df.columns.tolist()
        sample = df.head(3).to_dict(orient="records")
        sheet_schemas[name] = CSVFileSchema(
            **sheet_base_kwargs,
            delimiter=",",
            has_header=True,
            columns=cols,
            sample_rows=sample,
        )

    return ExcelFileSchema(
        **file_info.base_kwargs(),
        sheet_names=list(sheets.keys()),
        sheet_schemas=sheet_schemas,
    )
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def _build_video_schema(file_info: FileInfo) -> Optional[VideoFileSchema]:
    """
    Build video schema by calling `ffprobe` to extract metadata.

    Robustness fix: ffprobe can report "N/A" (or omit) numeric fields such as
    duration and bit_rate; previously a single such field raised ValueError
    and discarded the whole schema. Malformed fields now fall back to their
    defaults individually.

    Args:
        file_info (FileInfo): Base file info for the video file.

    Returns:
        VideoFileSchema if ffprobe succeeds; None otherwise (caller should fall back).
    """

    def _to_int(value, default=0):
        # Tolerate None / "N/A" / other non-numeric strings.
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    def _to_float(value, default=0.0):
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    try:
        cmd = [
            "ffprobe",
            "-v",
            "quiet",
            "-print_format",
            "json",
            "-show_format",
            "-show_streams",
            str(file_info.file_or_url),
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
        if result.returncode != 0:
            return None

        data = json.loads(result.stdout)
        format_info = data.get("format", {})
        streams = data.get("streams", [])

        # Find best video stream
        video_stream = next(
            (s for s in streams if s.get("codec_type") == "video"), None
        )
        audio_stream = next(
            (s for s in streams if s.get("codec_type") == "audio"), None
        )

        if not video_stream:
            return None

        # Extract key fields with safe defaults
        width = _to_int(video_stream.get("width", 0))
        height = _to_int(video_stream.get("height", 0))
        codec = video_stream.get("codec_name", "unknown")
        duration = _to_float(format_info.get("duration", 0.0))
        bitrate = _to_int(format_info.get("bit_rate", 0)) // 1000  # bps -> kbps

        # Framerate: may be "30/1" or "2997/100"
        fps_str = video_stream.get("avg_frame_rate", "0/1")
        if "/" in fps_str:
            try:
                num, den = map(int, fps_str.split("/"))
                framerate = num / den if den != 0 else 0.0
            except ValueError:
                framerate = 0.0
        else:
            framerate = _to_float(fps_str or 0.0)

        # Rotation from metadata tags
        rotation = 0
        tags = video_stream.get("tags", {})
        rotate_tag = tags.get("rotate") or tags.get("ROTATE")
        if rotate_tag and re.match(r"^-?\d+$", rotate_tag):
            rotation = int(rotate_tag) % 360

        return VideoFileSchema(
            **file_info.base_kwargs(),
            duration_sec=duration,
            width=width,
            height=height,
            codec=codec,
            framerate=framerate,
            bitrate_kbps=bitrate,
            has_audio=audio_stream is not None,
            audio_codec=audio_stream.get("codec_name") if audio_stream else None,
            rotation=rotation,
        )

    except (
        subprocess.SubprocessError,
        FileNotFoundError,
        json.JSONDecodeError,
        ValueError,
        OSError,
    ) as e:
        logger.warning(f"Error building video schema for {file_info.file_or_url}: {e}")
        return None
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
def _detect_encoding(raw: bytes) -> str:
|
|
633
|
+
"""Detect text encoding from raw bytes using charset_normalizer (fallback to utf-8)."""
|
|
634
|
+
try:
|
|
635
|
+
import charset_normalizer
|
|
636
|
+
|
|
637
|
+
result = charset_normalizer.from_bytes(raw).best()
|
|
638
|
+
return result.encoding if result else "utf-8"
|
|
639
|
+
except ImportError:
|
|
640
|
+
return "utf-8"
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def _json_root_type(obj) -> Literal["object", "array", "scalar"]:
|
|
644
|
+
"""Determine JSON root type for schema labeling."""
|
|
645
|
+
if isinstance(obj, dict):
|
|
646
|
+
return "object"
|
|
647
|
+
elif isinstance(obj, list):
|
|
648
|
+
return "array"
|
|
649
|
+
else:
|
|
650
|
+
return "scalar"
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
def _parse_pdf_keywords(raw: Optional[str]) -> List[str]:
|
|
654
|
+
"""Parse PDF keywords string into a clean list."""
|
|
655
|
+
if not raw:
|
|
656
|
+
return []
|
|
657
|
+
# Normalize: split by common delimiters and strip
|
|
658
|
+
return [k.strip() for k in re.split(r"[,;|]", raw) if k.strip()]
|