nia_sync-0.1.0-py3-none-any.whl

extractor.py ADDED
@@ -0,0 +1,947 @@
+ """
+ Database and folder extractor for Nia Local Sync CLI.
+
+ Extracts text content from SQLite databases and folders,
+ converting them into virtual "files" for indexing.
+
+ Supported types:
+ - iMessage (~/Library/Messages/chat.db)
+ - Safari History (~/Library/Safari/History.db)
+ - Chrome/Brave/Edge History
+ - Firefox History (places.sqlite)
+ - Telegram (JSON export)
+ - Regular folders
+ """
+ import os
+ import re
+ import json
+ import sqlite3
+ import tempfile
+ import zipfile
+ import logging
+ from datetime import datetime, timezone
+ from typing import Any
+ from pathlib import Path
+ from urllib.parse import urlparse
+
+ logger = logging.getLogger(__name__)
+
+ # =============================================================================
+ # EXCLUSION PATTERNS - Synced with backend/utils/exclusion_patterns.py
+ # =============================================================================
+
+ # Directories to skip entirely (prevents os.walk from descending)
+ SKIP_DIRS = {
+     # VCS
+     ".git", ".svn", ".hg", ".bzr",
+     # Node/JS
+     "node_modules", ".npm", ".pnpm-store", ".yarn", "bower_components",
+     ".next", ".nuxt", ".output", ".svelte-kit", ".parcel-cache", ".cache", ".turbo",
+     # Python
+     "__pycache__", "venv", ".venv", "env", ".tox", ".nox",
+     ".pytest_cache", ".mypy_cache", ".ruff_cache", ".hypothesis", "htmlcov", ".Python",
+     # JVM / Rust
+     "target", ".gradle", ".m2",
+     # Go
+     "vendor",
+     # Ruby
+     ".bundle",
+     # .NET
+     "bin", "obj", "packages",
+     # iOS/macOS
+     "DerivedData", "Pods", ".build",
+     # Build outputs
+     "dist", "build", "out", "output", "release", "debug", "coverage", ".nyc_output",
+     # IDE
+     ".idea", ".vscode", ".atom",
+     # OS
+     ".Spotlight-V100", ".Trashes",
+     # Misc
+     ".terraform", ".vagrant", ".docker", ".kube",
+     "logs", "log", "tmp", "temp",
+     ".aws", ".ssh",
+ }
+
+ # File extensions to skip (from backend exclusion_patterns.py)
+ SKIP_EXTENSIONS = {
+     # Security - keys/certs
+     ".pem", ".key", ".p12", ".pfx", ".crt", ".cer",
+     # Python compiled
+     ".pyc", ".pyo", ".pyd", ".egg",
+     # JVM
+     ".class", ".jar", ".war", ".ear",
+     # .NET
+     ".exe", ".pdb", ".nupkg",
+     # Compiled binaries
+     ".so", ".dylib", ".dll", ".o", ".obj", ".a", ".lib", ".wasm",
+     # Databases
+     ".sqlite", ".sqlite3", ".db", ".sql",
+     # Images
+     ".png", ".jpg", ".jpeg", ".gif", ".ico", ".webp", ".bmp", ".tiff", ".tif",
+     ".psd", ".ai", ".sketch", ".fig",
+     # Videos
+     ".mp4", ".avi", ".mov", ".wmv", ".webm", ".mkv", ".flv",
+     # Audio
+     ".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a",
+     # Documents
+     ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
+     # Archives
+     ".zip", ".tar", ".gz", ".tgz", ".rar", ".7z", ".bz2", ".xz",
+     # Fonts
+     ".woff", ".woff2", ".ttf", ".otf", ".eot",
+     # Logs/temp
+     ".log", ".tmp", ".temp", ".bak", ".backup", ".old", ".swp", ".swo",
+     # Coverage
+     ".lcov",
+     # IDE
+     ".code-workspace",
+ }
+
+ # Specific filenames to skip (from backend exclusion_patterns.py)
+ SKIP_FILES = {
+     # Lock files
+     "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "bun.lockb",
+     "poetry.lock", "Pipfile.lock", "Gemfile.lock", "composer.lock",
+     "Cargo.lock", "gradle.lockfile", "Package.resolved",
+     # OS files
+     ".DS_Store", "Thumbs.db", "desktop.ini", "ehthumbs.db",
+     # Security - credentials
+     ".env", ".envrc", ".npmrc", ".pypirc", ".netrc", ".htpasswd",
+     # Logs
+     "npm-debug.log", "yarn-debug.log", "yarn-error.log", ".pnpm-debug.log",
+     "pip-log.txt",
+     # IDE
+     ".project", ".classpath",
+     # Python
+     ".coverage",
+ }
+
+ # Patterns that match anywhere in the path (for files like id_rsa, credentials.json)
+ SKIP_PATH_PATTERNS = {
+     "credentials", "secrets", ".secret", ".secrets",
+     "id_rsa", "id_dsa", "id_ecdsa", "id_ed25519",
+ }
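+
+ # Illustrative examples (hypothetical paths) of what these filters exclude:
+ #     repo/node_modules/pkg/index.js  -> skipped via SKIP_DIRS ("node_modules")
+ #     app/.env                        -> skipped via SKIP_FILES (".env")
+ #     deploy/aws_credentials.json     -> skipped via SKIP_PATH_PATTERNS ("credentials")
+ #     assets/logo.png                 -> skipped via SKIP_EXTENSIONS (".png")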
+
+ # Type identifiers
+ TYPE_IMESSAGE = "imessage"
+ TYPE_SAFARI_HISTORY = "safari_history"
+ TYPE_CHROME_HISTORY = "chrome_history"
+ TYPE_FIREFOX_HISTORY = "firefox_history"
+ TYPE_TELEGRAM = "telegram"
+ TYPE_FOLDER = "folder"
+ TYPE_GENERIC_DB = "generic"
+
+ # Limits
+ MAX_ROWS = 100_000
+ MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024  # 10MB per file
+
+
+ def detect_source_type(path: str) -> str:
+     """
+     Auto-detect the type of source based on path and file structure.
+
+     Args:
+         path: Path to file or directory
+
+     Returns:
+         Type identifier string
+     """
+     # Check if directory (regular folder or telegram export)
+     if os.path.isdir(path):
+         if os.path.exists(os.path.join(path, "result.json")):
+             return TYPE_TELEGRAM
+         return TYPE_FOLDER
+
+     # Anything that is not a regular file falls back to folder handling.
+     # (Checked up front so zipfile.is_zipfile can't raise on a missing path.)
+     if not os.path.isfile(path):
+         return TYPE_FOLDER
+
+     # Check for Telegram JSON export
+     if path.endswith(".json"):
+         try:
+             with open(path, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+                 if "chats" in data and isinstance(data.get("chats"), dict):
+                     return TYPE_TELEGRAM
+         except Exception:
+             pass
+
+     # Check for ZIP (could be Telegram export)
+     if zipfile.is_zipfile(path):
+         try:
+             with zipfile.ZipFile(path, "r") as zf:
+                 names = zf.namelist()
+                 if "result.json" in names or any(n.endswith("/result.json") for n in names):
+                     return TYPE_TELEGRAM
+         except Exception:
+             pass
+
+     # Check SQLite databases
+     try:
+         conn = sqlite3.connect(path)
+         cursor = conn.cursor()
+         cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+         tables = {row[0].lower() for row in cursor.fetchall()}
+         conn.close()
+
+         # iMessage
+         if "message" in tables and "handle" in tables and "chat" in tables:
+             return TYPE_IMESSAGE
+
+         # Safari History
+         if "history_items" in tables and "history_visits" in tables:
+             return TYPE_SAFARI_HISTORY
+
+         # Chrome/Brave/Edge History
+         if "urls" in tables and "visits" in tables and "keyword_search_terms" in tables:
+             return TYPE_CHROME_HISTORY
+
+         # Firefox History
+         if "moz_places" in tables and "moz_historyvisits" in tables:
+             return TYPE_FIREFOX_HISTORY
+
+         return TYPE_GENERIC_DB
+
+     except Exception:
+         return TYPE_FOLDER
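+
+ # Detection sketch (hypothetical paths; expected results as trailing comments):
+ #
+ #     detect_source_type(os.path.expanduser("~/Library/Messages/chat.db"))   # "imessage"
+ #     detect_source_type(os.path.expanduser("~/Library/Safari/History.db"))  # "safari_history"
+ #     detect_source_type("./notes")                                          # "folder"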
+
+
+ def extract_incremental(
+     path: str,
+     source_type: str,
+     cursor: dict[str, Any] | None = None,
+     limit: int = MAX_ROWS,
+ ) -> dict[str, Any]:
+     """
+     Extract data incrementally from a source.
+
+     Args:
+         path: Path to the source
+         source_type: Type of source
+         cursor: Previous sync cursor (for incremental extraction)
+         limit: Maximum items to extract
+
+     Returns:
+         Dict with files, new cursor, and stats
+     """
+     cursor = cursor or {}
+
+     if source_type == TYPE_IMESSAGE:
+         return _extract_imessage(path, cursor, limit)
+     elif source_type == TYPE_SAFARI_HISTORY:
+         return _extract_safari_history(path, cursor, limit)
+     elif source_type == TYPE_CHROME_HISTORY:
+         return _extract_chrome_history(path, cursor, limit)
+     elif source_type == TYPE_FIREFOX_HISTORY:
+         return _extract_firefox_history(path, cursor, limit)
+     elif source_type == TYPE_TELEGRAM:
+         return _extract_telegram(path, cursor, limit)
+     elif source_type == TYPE_FOLDER:
+         return _extract_folder(path, cursor, limit)
+     else:
+         return _extract_generic_db(path, cursor, limit)
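+
+ # Incremental-sync sketch. load_cursor/save_cursor/upload are hypothetical
+ # caller-side hooks, not part of this module; only the cursor round-trip is real:
+ #
+ #     cursor = load_cursor(path)  # None on the first run
+ #     result = extract_incremental(path, detect_source_type(path), cursor=cursor)
+ #     upload(result["files"])
+ #     save_cursor(path, result["cursor"])  # feeds the next run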
+
+
+ def _extract_imessage(
+     db_path: str,
+     cursor: dict[str, Any],
+     limit: int,
+ ) -> dict[str, Any]:
+     """Extract messages from iMessage chat.db."""
+     files = []
+     max_rowid = cursor.get("last_rowid", 0)
+     max_timestamp = cursor.get("last_timestamp", 0)
+     since_rowid = cursor.get("last_rowid")
+
+     conn = sqlite3.connect(db_path)
+     conn.row_factory = sqlite3.Row
+     cur = conn.cursor()
+
+     where_clauses = ["m.text IS NOT NULL", "m.text != ''"]
+     params = []
+
+     if since_rowid:
+         where_clauses.append("m.ROWID > ?")
+         params.append(since_rowid)
+
+     params.append(limit)
+
+     query = f"""
+         SELECT
+             m.ROWID as row_id,
+             m.text,
+             m.date,
+             m.is_from_me,
+             m.service,
+             h.id as contact_id,
+             COALESCE(h.uncanonicalized_id, h.id) as contact_display
+         FROM message m
+         LEFT JOIN handle h ON m.handle_id = h.ROWID
+         WHERE {' AND '.join(where_clauses)}
+         ORDER BY m.ROWID ASC
+         LIMIT ?
+     """
+
+     cur.execute(query, params)
+     rows = cur.fetchall()
+
+     for row in rows:
+         row_id = row["row_id"]
+         text = row["text"]
+         apple_date = row["date"]
+         is_from_me = row["is_from_me"]
+         contact_display = row["contact_display"] or row["contact_id"] or "unknown"
+
+         max_rowid = max(max_rowid, row_id)
+         if apple_date:
+             max_timestamp = max(max_timestamp, apple_date)
+
+         if not text or len(text.strip()) < 2:
+             continue
+
+         # Convert Apple date to ISO
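+         # (chat.db stores dates as nanoseconds since 2001-01-01 UTC on recent
+         # macOS versions; 978307200 is that reference date as a Unix timestamp.)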
+         try:
+             if apple_date:
+                 unix_ts = (apple_date / 1_000_000_000) + 978307200
+                 dt = datetime.fromtimestamp(unix_ts, tz=timezone.utc)
+                 timestamp_str = dt.isoformat()
+                 date_prefix = dt.strftime("%Y-%m-%d")
+             else:
+                 timestamp_str = None
+                 date_prefix = "unknown"
+         except Exception:
+             timestamp_str = None
+             date_prefix = "unknown"
+
+         safe_contact = re.sub(r"[^\w\-_]", "_", str(contact_display))[:50]
+         direction = "sent" if is_from_me else "received"
+         file_path = f"messages/{safe_contact}/{date_prefix}_{row_id}_{direction}.txt"
+
+         files.append({
+             "path": file_path,
+             "content": text,
+             "metadata": {
+                 "db_type": TYPE_IMESSAGE,
+                 "row_id": row_id,
+                 "timestamp": timestamp_str,
+                 "contact": contact_display,
+                 "is_from_me": bool(is_from_me),
+             },
+         })
+
+     conn.close()
+     logger.info(f"Extracted {len(files)} messages from iMessage")
+
+     return {
+         "files": files,
+         "cursor": {"last_rowid": max_rowid, "last_timestamp": max_timestamp},
+         "stats": {"extracted": len(files), "db_type": TYPE_IMESSAGE},
+     }
+
+
+ def _extract_safari_history(
+     db_path: str,
+     cursor: dict[str, Any],
+     limit: int,
+ ) -> dict[str, Any]:
+     """Extract browsing history from Safari History.db."""
+     files = []
+     max_visit_time = cursor.get("last_visit_time", 0)
+     since_visit_time = cursor.get("last_visit_time")
+
+     conn = sqlite3.connect(db_path)
+     conn.row_factory = sqlite3.Row
+     cur = conn.cursor()
+
+     where_clauses = ["hv.title IS NOT NULL", "hv.title != ''"]
+     params = []
+
+     if since_visit_time:
+         where_clauses.append("hv.visit_time > ?")
+         params.append(since_visit_time)
+
+     params.append(limit)
+
+     query = f"""
+         SELECT
+             hi.id,
+             hi.url,
+             hi.domain_expansion,
+             hv.title,
+             hv.visit_time
+         FROM history_visits hv
+         JOIN history_items hi ON hi.id = hv.history_item
+         WHERE {' AND '.join(where_clauses)}
+         ORDER BY hv.visit_time ASC
+         LIMIT ?
+     """
+
+     cur.execute(query, params)
+     rows = cur.fetchall()
+
+     for row in rows:
+         item_id = row["id"]
+         url = row["url"] or ""
+         domain = row["domain_expansion"] or ""
+         title = row["title"] or ""
+         visit_time = row["visit_time"]
+
+         max_visit_time = max(max_visit_time, visit_time or 0)
+
+         if not title.strip():
+             continue
+
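+         # Safari stores visit_time as seconds since 2001-01-01 UTC (the Apple
+         # Core Data epoch), hence the 978307200 offset to reach the Unix epoch.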
+         try:
+             if visit_time:
+                 unix_ts = visit_time + 978307200
+                 dt = datetime.fromtimestamp(unix_ts, tz=timezone.utc)
+                 timestamp_str = dt.isoformat()
+                 date_prefix = dt.strftime("%Y-%m-%d")
+             else:
+                 timestamp_str = None
+                 date_prefix = "unknown"
+         except Exception:
+             timestamp_str = None
+             date_prefix = "unknown"
+
+         content = f"{title}\n{url}"
+         safe_domain = re.sub(r"[^\w\-_]", "_", domain)[:30] if domain else "other"
+         file_path = f"history/{safe_domain}/{date_prefix}_{item_id}.txt"
+
+         files.append({
+             "path": file_path,
+             "content": content,
+             "metadata": {
+                 "db_type": TYPE_SAFARI_HISTORY,
+                 "row_id": item_id,
+                 "timestamp": timestamp_str,
+                 "url": url,
+                 "domain": domain,
+             },
+         })
+
+     conn.close()
+     logger.info(f"Extracted {len(files)} history items from Safari")
+
+     return {
+         "files": files,
+         "cursor": {"last_visit_time": max_visit_time},
+         "stats": {"extracted": len(files), "db_type": TYPE_SAFARI_HISTORY},
+     }
+
+
+ def _extract_chrome_history(
+     db_path: str,
+     cursor: dict[str, Any],
+     limit: int,
+ ) -> dict[str, Any]:
+     """Extract browsing history from Chrome/Brave/Edge."""
+     files = []
+     max_visit_time = cursor.get("last_visit_time", 0)
+     since_visit_time = cursor.get("last_visit_time")
+
+     conn = sqlite3.connect(db_path)
+     conn.row_factory = sqlite3.Row
+     cur = conn.cursor()
+
+     where_clauses = ["u.title IS NOT NULL", "u.title != ''"]
+     params = []
+
+     if since_visit_time:
+         where_clauses.append("v.visit_time > ?")
+         params.append(since_visit_time)
+
+     params.append(limit)
+
+     query = f"""
+         SELECT
+             u.id,
+             u.url,
+             u.title,
+             v.visit_time
+         FROM visits v
+         JOIN urls u ON u.id = v.url
+         WHERE {' AND '.join(where_clauses)}
+         ORDER BY v.visit_time ASC
+         LIMIT ?
+     """
+
+     cur.execute(query, params)
+     rows = cur.fetchall()
+
+     for row in rows:
+         url_id = row["id"]
+         url = row["url"] or ""
+         title = row["title"] or ""
+         visit_time = row["visit_time"]
+
+         max_visit_time = max(max_visit_time, visit_time or 0)
+
+         if not title.strip():
+             continue
+
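+         # Chrome stores visit_time as microseconds since 1601-01-01 UTC (the
+         # Windows epoch); 11644473600 seconds separate it from the Unix epoch.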
+         try:
+             if visit_time:
+                 unix_ts = (visit_time / 1_000_000) - 11644473600
+                 dt = datetime.fromtimestamp(unix_ts, tz=timezone.utc)
+                 timestamp_str = dt.isoformat()
+                 date_prefix = dt.strftime("%Y-%m-%d")
+             else:
+                 timestamp_str = None
+                 date_prefix = "unknown"
+         except Exception:
+             timestamp_str = None
+             date_prefix = "unknown"
+
+         try:
+             domain = urlparse(url).netloc or "other"
+         except Exception:
+             domain = "other"
+
+         content = f"{title}\n{url}"
+         safe_domain = re.sub(r"[^\w\-_]", "_", domain)[:30]
+         file_path = f"history/{safe_domain}/{date_prefix}_{url_id}.txt"
+
+         files.append({
+             "path": file_path,
+             "content": content,
+             "metadata": {
+                 "db_type": TYPE_CHROME_HISTORY,
+                 "row_id": url_id,
+                 "timestamp": timestamp_str,
+                 "url": url,
+                 "domain": domain,
+             },
+         })
+
+     conn.close()
+     logger.info(f"Extracted {len(files)} history items from Chrome")
+
+     return {
+         "files": files,
+         "cursor": {"last_visit_time": max_visit_time},
+         "stats": {"extracted": len(files), "db_type": TYPE_CHROME_HISTORY},
+     }
+
+
+ def _extract_firefox_history(
+     db_path: str,
+     cursor: dict[str, Any],
+     limit: int,
+ ) -> dict[str, Any]:
+     """Extract browsing history from Firefox places.sqlite."""
+     files = []
+     max_visit_date = cursor.get("last_visit_date", 0)
+     since_visit_date = cursor.get("last_visit_date")
+
+     conn = sqlite3.connect(db_path)
+     conn.row_factory = sqlite3.Row
+     cur = conn.cursor()
+
+     where_clauses = ["p.title IS NOT NULL", "p.title != ''"]
+     params = []
+
+     if since_visit_date:
+         where_clauses.append("h.visit_date > ?")
+         params.append(since_visit_date)
+
+     params.append(limit)
+
+     query = f"""
+         SELECT
+             p.id,
+             p.url,
+             p.title,
+             h.visit_date
+         FROM moz_places p
+         JOIN moz_historyvisits h ON p.id = h.place_id
+         WHERE {' AND '.join(where_clauses)}
+         ORDER BY h.visit_date ASC
+         LIMIT ?
+     """
+
+     cur.execute(query, params)
+     rows = cur.fetchall()
+
+     for row in rows:
+         place_id = row["id"]
+         url = row["url"] or ""
+         title = row["title"] or ""
+         visit_date = row["visit_date"]
+
+         max_visit_date = max(max_visit_date, visit_date or 0)
+
+         if not title.strip():
+             continue
+
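+         # Firefox stores visit_date as microseconds since the Unix epoch (PRTime).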
+         try:
+             if visit_date:
+                 unix_ts = visit_date / 1_000_000
+                 dt = datetime.fromtimestamp(unix_ts, tz=timezone.utc)
+                 timestamp_str = dt.isoformat()
+                 date_prefix = dt.strftime("%Y-%m-%d")
+             else:
+                 timestamp_str = None
+                 date_prefix = "unknown"
+         except Exception:
+             timestamp_str = None
+             date_prefix = "unknown"
+
+         try:
+             domain = urlparse(url).netloc or "other"
+         except Exception:
+             domain = "other"
+
+         content = f"{title}\n{url}"
+         safe_domain = re.sub(r"[^\w\-_]", "_", domain)[:30]
+         file_path = f"history/{safe_domain}/{date_prefix}_{place_id}.txt"
+
+         files.append({
+             "path": file_path,
+             "content": content,
+             "metadata": {
+                 "db_type": TYPE_FIREFOX_HISTORY,
+                 "row_id": place_id,
+                 "timestamp": timestamp_str,
+                 "url": url,
+                 "domain": domain,
+             },
+         })
+
+     conn.close()
+     logger.info(f"Extracted {len(files)} history items from Firefox")
+
+     return {
+         "files": files,
+         "cursor": {"last_visit_date": max_visit_date},
+         "stats": {"extracted": len(files), "db_type": TYPE_FIREFOX_HISTORY},
+     }
+
+
+ def _extract_telegram(
+     export_path: str,
+     cursor: dict[str, Any],
+     limit: int,
+ ) -> dict[str, Any]:
+     """Extract messages from Telegram export (JSON or ZIP)."""
+     # Handle different input types; all paths delegate to _extract_telegram_json
+     if zipfile.is_zipfile(export_path):
+         with tempfile.TemporaryDirectory() as temp_dir:
+             with zipfile.ZipFile(export_path, "r") as zf:
+                 zf.extractall(temp_dir)
+             result_file = os.path.join(temp_dir, "result.json")
+             if not os.path.exists(result_file):
+                 for item in os.listdir(temp_dir):
+                     subdir = os.path.join(temp_dir, item)
+                     if os.path.isdir(subdir):
+                         candidate = os.path.join(subdir, "result.json")
+                         if os.path.exists(candidate):
+                             result_file = candidate
+                             break
+             return _extract_telegram_json(result_file, cursor, limit)
+     elif os.path.isdir(export_path):
+         result_file = os.path.join(export_path, "result.json")
+     else:
+         result_file = export_path
+
+     return _extract_telegram_json(result_file, cursor, limit)
+
+
+ def _extract_telegram_json(
+     result_file: str,
+     cursor: dict[str, Any],
+     limit: int,
+ ) -> dict[str, Any]:
+     """Extract from Telegram result.json file."""
+     files = []
+     max_message_id = cursor.get("last_message_id", 0)
+     since_message_id = cursor.get("last_message_id")
+     extracted_count = 0
+
+     with open(result_file, "r", encoding="utf-8") as f:
+         data = json.load(f)
+
+     chats_data = data.get("chats", {})
+     chats = chats_data.get("list", []) if isinstance(chats_data, dict) else []
+
+     for chat in chats:
+         if extracted_count >= limit:
+             break
+
+         chat_name = chat.get("name") or "Unknown Chat"
+         chat_type = chat.get("type", "personal_chat")
+
+         messages = chat.get("messages", [])
+
+         for msg in messages:
+             if extracted_count >= limit:
+                 break
+
+             msg_id = msg.get("id")
+             msg_type = msg.get("type", "message")
+
+             if since_message_id and msg_id and msg_id <= since_message_id:
+                 continue
+
+             if msg_type != "message":
+                 continue
+
+             # Handle text (can be string or list)
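+             # e.g. "hello" or ["see ", {"type": "link", "text": "https://example.com"}]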
+             text_content = msg.get("text", "")
+             if isinstance(text_content, list):
+                 parts = []
+                 for item in text_content:
+                     if isinstance(item, dict):
+                         parts.append(item.get("text", ""))
+                     elif isinstance(item, str):
+                         parts.append(item)
+                 text_content = "".join(parts)
+
+             if not text_content or not text_content.strip():
+                 continue
+
+             if msg_id:
+                 max_message_id = max(max_message_id, msg_id)
+
+             date_str = msg.get("date", "")
+             try:
+                 if date_str:
+                     dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+                     timestamp_str = dt.isoformat()
+                     date_prefix = dt.strftime("%Y-%m-%d")
+                 else:
+                     timestamp_str = None
+                     date_prefix = "unknown"
+             except Exception:
+                 timestamp_str = date_str
+                 date_prefix = "unknown"
+
+             from_name = msg.get("from", "") or msg.get("actor", "") or "Unknown"
+
+             safe_chat = re.sub(r"[^\w\-_]", "_", chat_name)[:50]
+             file_path = f"telegram/{safe_chat}/{date_prefix}_{msg_id}.txt"
+
+             files.append({
+                 "path": file_path,
+                 "content": text_content,
+                 "metadata": {
+                     "db_type": TYPE_TELEGRAM,
+                     "chat_name": chat_name,
+                     "chat_type": chat_type,
+                     "message_id": msg_id,
+                     "timestamp": timestamp_str,
+                     "from_name": from_name,
+                 },
+             })
+             extracted_count += 1
+
+     logger.info(f"Extracted {len(files)} messages from Telegram")
+
+     return {
+         "files": files,
+         "cursor": {"last_message_id": max_message_id},
+         "stats": {"extracted": len(files), "db_type": TYPE_TELEGRAM},
+     }
+
+
+ def _extract_folder(
+     folder_path: str,
+     cursor: dict[str, Any],
+     limit: int,
+ ) -> dict[str, Any]:
+     """Extract text files from a regular folder with proper exclusion patterns."""
+     files = []
+     last_mtime = cursor.get("last_mtime", 0)
+     max_mtime = last_mtime
+     extracted_count = 0
+
+     # Allowed text file extensions
+     text_extensions = {
+         ".txt", ".md", ".py", ".js", ".ts", ".tsx", ".jsx", ".json", ".yaml", ".yml",
+         ".html", ".css", ".scss", ".less", ".xml", ".csv", ".sh", ".bash", ".zsh",
+         ".rs", ".go", ".java", ".c", ".cpp", ".h", ".hpp", ".rb", ".vue", ".svelte",
+         ".php", ".swift", ".kt", ".scala", ".r", ".sql", ".toml", ".ini", ".cfg",
+         ".makefile", ".dockerfile", ".gitignore", ".editorconfig",
+     }
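+     # Note: files without any extension (e.g. Makefile, Dockerfile) are not
+     # rejected by the extension filter below, so they are still picked up.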
+
+     for root, dirs, filenames in os.walk(folder_path, topdown=True):
+         if extracted_count >= limit:
+             break
+
+         # Filter out excluded directories IN-PLACE to prevent os.walk from descending
+         dirs[:] = [
+             d for d in dirs
+             if d not in SKIP_DIRS
+             and not d.startswith(".")
+             and not d.endswith(".egg-info")
+         ]
+
+         for filename in filenames:
+             if extracted_count >= limit:
+                 break
+
+             # Skip by filename
+             if filename in SKIP_FILES:
+                 continue
+
+             # Skip hidden files
+             if filename.startswith("."):
+                 continue
+
+             # Skip files matching security patterns (credentials, secrets, keys)
+             filename_lower = filename.lower()
+             if any(pattern in filename_lower for pattern in SKIP_PATH_PATTERNS):
+                 continue
+
+             ext = Path(filename).suffix.lower()
+
+             # Skip by extension
+             if ext in SKIP_EXTENSIONS:
+                 continue
+
+             # Only include known text extensions (extensionless files pass through)
+             if ext and ext not in text_extensions:
+                 continue
+
+             file_path = os.path.join(root, filename)
+
+             try:
+                 stat = os.stat(file_path)
+                 mtime = stat.st_mtime
+
+                 # Skip if not modified since last sync
+                 if mtime <= last_mtime:
+                     continue
+
+                 # Skip large files
+                 if stat.st_size > MAX_FILE_SIZE_BYTES:
+                     continue
+
+                 with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                     content = f.read()
+
+                 if not content.strip():
+                     continue
+
+                 max_mtime = max(max_mtime, mtime)
+
+                 # Relative path from folder root
+                 rel_path = os.path.relpath(file_path, folder_path)
+
+                 files.append({
+                     "path": rel_path,
+                     "content": content,
+                     "metadata": {
+                         "db_type": TYPE_FOLDER,
+                         "extension": ext,
+                         "mtime": mtime,
+                     },
+                 })
+                 extracted_count += 1
+
+             except OSError as e:
+                 logger.warning(f"Could not read {file_path}: {e}")
+                 continue
+
+     logger.info(f"Extracted {len(files)} files from folder")
+
+     return {
+         "files": files,
+         "cursor": {"last_mtime": max_mtime},
+         "stats": {"extracted": len(files), "db_type": TYPE_FOLDER},
+     }
+
+
+ def _extract_generic_db(
+     db_path: str,
+     cursor: dict[str, Any],
+     limit: int,
+ ) -> dict[str, Any]:
+     """Extract from generic SQLite database."""
+     files = []
+     total_extracted = 0
+
+     skip_tables = {"sqlite_sequence", "sqlite_stat1", "sqlite_stat4"}
+
+     conn = sqlite3.connect(db_path)
+     cur = conn.cursor()
+
+     cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
+     tables = [row[0] for row in cur.fetchall()]
+
+     for table_name in tables:
+         if table_name.lower() in skip_tables:
+             continue
+
+         if total_extracted >= limit:
+             break
+
+         try:
+             cur.execute(f'PRAGMA table_info("{table_name}")')
+             columns = cur.fetchall()
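+             # Each table_info row is (cid, name, type, notnull, dflt_value, pk)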
890
+
891
+ text_columns = [
892
+ col[1]
893
+ for col in columns
894
+ if col[2].upper() in ("TEXT", "VARCHAR", "CHAR", "CLOB")
895
+ ]
896
+
897
+ if not text_columns:
898
+ continue
899
+
900
+ pk_column = next((col[1] for col in columns if col[5] == 1), "rowid")
901
+
902
+ select_cols = [f'"{pk_column}"'] + [f'"{col}"' for col in text_columns]
903
+ remaining = limit - total_extracted
904
+ query = f'SELECT {", ".join(select_cols)} FROM "{table_name}" LIMIT ?'
905
+
906
+ cur.execute(query, (remaining,))
907
+ rows = cur.fetchall()
908
+
909
+ for row in rows:
910
+ pk_value = row[0]
911
+ text_values = row[1:]
912
+
913
+ combined = []
914
+ for col_name, value in zip(text_columns, text_values):
915
+ if value and isinstance(value, str) and value.strip():
916
+ combined.append(f"{col_name}: {value}")
917
+
918
+ if not combined:
919
+ continue
920
+
921
+ content = "\n".join(combined)
922
+ safe_table = re.sub(r"[^\w\-_]", "_", table_name)
923
+ file_path = f"{safe_table}/row_{pk_value}.txt"
924
+
925
+ files.append({
926
+ "path": file_path,
927
+ "content": content,
928
+ "metadata": {
929
+ "db_type": TYPE_GENERIC_DB,
930
+ "table": table_name,
931
+ "row_id": pk_value,
932
+ },
933
+ })
934
+ total_extracted += 1
935
+
936
+ except Exception as e:
937
+ logger.warning(f"Error extracting from table {table_name}: {e}")
938
+ continue
939
+
940
+ conn.close()
941
+ logger.info(f"Extracted {len(files)} rows from generic database")
942
+
943
+ return {
944
+ "files": files,
945
+ "cursor": {},
946
+ "stats": {"extracted": len(files), "db_type": TYPE_GENERIC_DB},
947
+ }